{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.25, "eval_steps": 2000, "global_step": 4000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.000125, "grad_norm": 428.0, "learning_rate": 1.18e-05, "loss": 99.3619, "loss/crossentropy": 9.37970495223999, "loss/hidden": 16.625, "loss/jsd": 0.0, "loss/logits": 7.257124900817871, "step": 2 }, { "epoch": 0.00025, "grad_norm": 356.0, "learning_rate": 1.3600000000000002e-05, "loss": 97.1216, "loss/crossentropy": 9.159881591796875, "loss/hidden": 16.625, "loss/jsd": 0.0, "loss/logits": 7.145160675048828, "step": 4 }, { "epoch": 0.000375, "grad_norm": 380.0, "learning_rate": 1.54e-05, "loss": 100.1942, "loss/crossentropy": 9.186327457427979, "loss/hidden": 16.625, "loss/jsd": 0.0, "loss/logits": 7.2050676345825195, "step": 6 }, { "epoch": 0.0005, "grad_norm": 185.0, "learning_rate": 1.72e-05, "loss": 95.7478, "loss/crossentropy": 8.873358249664307, "loss/hidden": 16.5625, "loss/jsd": 0.0, "loss/logits": 6.897953987121582, "step": 8 }, { "epoch": 0.000625, "grad_norm": 163.0, "learning_rate": 1.9e-05, "loss": 92.634, "loss/crossentropy": 8.72307538986206, "loss/hidden": 16.375, "loss/jsd": 0.0, "loss/logits": 6.7046730518341064, "step": 10 }, { "epoch": 0.00075, "grad_norm": 153.0, "learning_rate": 2.0800000000000004e-05, "loss": 88.4929, "loss/crossentropy": 8.53145456314087, "loss/hidden": 16.3125, "loss/jsd": 0.0, "loss/logits": 6.585271596908569, "step": 12 }, { "epoch": 0.000875, "grad_norm": 153.0, "learning_rate": 2.2600000000000004e-05, "loss": 86.7945, "loss/crossentropy": 8.174943923950195, "loss/hidden": 16.25, "loss/jsd": 0.0, "loss/logits": 6.037625789642334, "step": 14 }, { "epoch": 0.001, "grad_norm": 121.0, "grad_norm_var": 14208.2, "learning_rate": 2.4400000000000004e-05, "loss": 81.4154, "loss/crossentropy": 7.956912994384766, "loss/hidden": 15.8125, "loss/jsd": 0.0, "loss/logits": 5.961349010467529, "step": 16 }, { "epoch": 0.001125, "grad_norm": 136.0, "grad_norm_var": 9968.116666666667, "learning_rate": 2.6200000000000003e-05, "loss": 80.922, "loss/crossentropy": 7.872050046920776, "loss/hidden": 15.34375, "loss/jsd": 0.0, "loss/logits": 5.787276268005371, "step": 18 }, { "epoch": 0.00125, "grad_norm": 64.5, "grad_norm_var": 8084.873958333334, "learning_rate": 2.8000000000000003e-05, "loss": 76.7299, "loss/crossentropy": 7.452724456787109, "loss/hidden": 15.21875, "loss/jsd": 0.0, "loss/logits": 5.1816017627716064, "step": 20 }, { "epoch": 0.001375, "grad_norm": 43.0, "grad_norm_var": 4135.966666666666, "learning_rate": 2.9800000000000006e-05, "loss": 74.1086, "loss/crossentropy": 7.244980812072754, "loss/hidden": 15.0625, "loss/jsd": 0.0, "loss/logits": 5.154205322265625, "step": 22 }, { "epoch": 0.0015, "grad_norm": 49.0, "grad_norm_var": 3930.31640625, "learning_rate": 3.16e-05, "loss": 71.8732, "loss/crossentropy": 7.059436321258545, "loss/hidden": 15.0, "loss/jsd": 0.0, "loss/logits": 4.9876039028167725, "step": 24 }, { "epoch": 0.001625, "grad_norm": 77.5, "grad_norm_var": 3864.5622395833334, "learning_rate": 3.3400000000000005e-05, "loss": 66.5866, "loss/crossentropy": 6.667392730712891, "loss/hidden": 14.90625, "loss/jsd": 0.0, "loss/logits": 4.435611724853516, "step": 26 }, { "epoch": 0.00175, "grad_norm": 72.0, "grad_norm_var": 3618.5559895833335, "learning_rate": 3.520000000000001e-05, "loss": 62.1951, "loss/crossentropy": 6.15596079826355, "loss/hidden": 14.15625, "loss/jsd": 0.0, "loss/logits": 4.280491590499878, "step": 28 }, { "epoch": 0.001875, "grad_norm": 66.5, "grad_norm_var": 3451.262239583333, "learning_rate": 3.7e-05, "loss": 59.1185, "loss/crossentropy": 5.833110809326172, "loss/hidden": 13.6875, "loss/jsd": 0.0, "loss/logits": 3.8512399196624756, "step": 30 }, { "epoch": 0.002, "grad_norm": 63.0, "grad_norm_var": 3467.72890625, "learning_rate": 3.88e-05, "loss": 54.6163, "loss/crossentropy": 5.594937324523926, "loss/hidden": 13.40625, "loss/jsd": 0.0, "loss/logits": 3.619442343711853, "step": 32 }, { "epoch": 0.002125, "grad_norm": 70.0, "grad_norm_var": 585.5997395833333, "learning_rate": 4.0600000000000004e-05, "loss": 49.8176, "loss/crossentropy": 5.316510438919067, "loss/hidden": 12.96875, "loss/jsd": 0.0, "loss/logits": 3.1636284589767456, "step": 34 }, { "epoch": 0.00225, "grad_norm": 54.25, "grad_norm_var": 208.6875, "learning_rate": 4.240000000000001e-05, "loss": 44.5006, "loss/crossentropy": 4.8605875968933105, "loss/hidden": 12.03125, "loss/jsd": 0.0, "loss/logits": 2.5045835971832275, "step": 36 }, { "epoch": 0.002375, "grad_norm": 43.0, "grad_norm_var": 205.99140625, "learning_rate": 4.420000000000001e-05, "loss": 41.8596, "loss/crossentropy": 4.582718849182129, "loss/hidden": 12.0, "loss/jsd": 0.0, "loss/logits": 2.616296648979187, "step": 38 }, { "epoch": 0.0025, "grad_norm": 39.75, "grad_norm_var": 217.99140625, "learning_rate": 4.600000000000001e-05, "loss": 38.6887, "loss/crossentropy": 4.197612762451172, "loss/hidden": 11.21875, "loss/jsd": 0.0, "loss/logits": 2.2158924341201782, "step": 40 }, { "epoch": 0.002625, "grad_norm": 57.75, "grad_norm_var": 215.47395833333334, "learning_rate": 4.78e-05, "loss": 35.5871, "loss/crossentropy": 4.077736258506775, "loss/hidden": 10.59375, "loss/jsd": 0.0, "loss/logits": 1.8813174366950989, "step": 42 }, { "epoch": 0.00275, "grad_norm": 33.75, "grad_norm_var": 158.04140625, "learning_rate": 4.96e-05, "loss": 33.6872, "loss/crossentropy": 4.088571310043335, "loss/hidden": 10.46875, "loss/jsd": 0.0, "loss/logits": 1.9159515500068665, "step": 44 }, { "epoch": 0.002875, "grad_norm": 27.25, "grad_norm_var": 173.02057291666668, "learning_rate": 5.14e-05, "loss": 31.5202, "loss/crossentropy": 3.7112059593200684, "loss/hidden": 10.09375, "loss/jsd": 0.0, "loss/logits": 1.8669533133506775, "step": 46 }, { "epoch": 0.003, "grad_norm": 30.625, "grad_norm_var": 153.88170572916667, "learning_rate": 5.3200000000000006e-05, "loss": 29.7576, "loss/crossentropy": 3.6459821462631226, "loss/hidden": 9.875, "loss/jsd": 0.0, "loss/logits": 1.7125096917152405, "step": 48 }, { "epoch": 0.003125, "grad_norm": 24.625, "grad_norm_var": 156.196875, "learning_rate": 5.500000000000001e-05, "loss": 28.6917, "loss/crossentropy": 3.534511685371399, "loss/hidden": 9.59375, "loss/jsd": 0.0, "loss/logits": 1.5916491150856018, "step": 50 }, { "epoch": 0.00325, "grad_norm": 26.0, "grad_norm_var": 119.83958333333334, "learning_rate": 5.680000000000001e-05, "loss": 27.6631, "loss/crossentropy": 3.3382843732833862, "loss/hidden": 9.25, "loss/jsd": 0.0, "loss/logits": 1.4924674034118652, "step": 52 }, { "epoch": 0.003375, "grad_norm": 21.25, "grad_norm_var": 231.38541666666666, "learning_rate": 5.860000000000001e-05, "loss": 26.1809, "loss/crossentropy": 3.426845669746399, "loss/hidden": 9.125, "loss/jsd": 0.0, "loss/logits": 1.3821245431900024, "step": 54 }, { "epoch": 0.0035, "grad_norm": 21.25, "grad_norm_var": 249.90358072916666, "learning_rate": 6.040000000000001e-05, "loss": 25.1479, "loss/crossentropy": 3.4015276432037354, "loss/hidden": 8.65625, "loss/jsd": 0.0, "loss/logits": 1.259027361869812, "step": 56 }, { "epoch": 0.003625, "grad_norm": 59.75, "grad_norm_var": 251.05416666666667, "learning_rate": 6.220000000000001e-05, "loss": 24.6783, "loss/crossentropy": 3.363521456718445, "loss/hidden": 8.5625, "loss/jsd": 0.0, "loss/logits": 1.218069314956665, "step": 58 }, { "epoch": 0.00375, "grad_norm": 14.9375, "grad_norm_var": 283.126806640625, "learning_rate": 6.400000000000001e-05, "loss": 23.6541, "loss/crossentropy": 3.4112290143966675, "loss/hidden": 8.5, "loss/jsd": 0.0, "loss/logits": 1.222625195980072, "step": 60 }, { "epoch": 0.003875, "grad_norm": 21.25, "grad_norm_var": 301.22120768229166, "learning_rate": 6.58e-05, "loss": 22.9688, "loss/crossentropy": 3.150188446044922, "loss/hidden": 8.28125, "loss/jsd": 0.0, "loss/logits": 1.157865822315216, "step": 62 }, { "epoch": 0.004, "grad_norm": 20.25, "grad_norm_var": 336.315087890625, "learning_rate": 6.76e-05, "loss": 22.4484, "loss/crossentropy": 3.2142295837402344, "loss/hidden": 8.078125, "loss/jsd": 0.0, "loss/logits": 1.1245205402374268, "step": 64 }, { "epoch": 0.004125, "grad_norm": 20.25, "grad_norm_var": 350.4535807291667, "learning_rate": 6.94e-05, "loss": 21.3778, "loss/crossentropy": 3.3086668252944946, "loss/hidden": 7.78125, "loss/jsd": 0.0, "loss/logits": 1.0809211134910583, "step": 66 }, { "epoch": 0.00425, "grad_norm": 17.75, "grad_norm_var": 363.0113932291667, "learning_rate": 7.120000000000001e-05, "loss": 20.7697, "loss/crossentropy": 3.0438809394836426, "loss/hidden": 7.578125, "loss/jsd": 0.0, "loss/logits": 0.9832420945167542, "step": 68 }, { "epoch": 0.004375, "grad_norm": 16.625, "grad_norm_var": 176.550634765625, "learning_rate": 7.3e-05, "loss": 20.7695, "loss/crossentropy": 3.005946159362793, "loss/hidden": 7.609375, "loss/jsd": 0.0, "loss/logits": 0.979515790939331, "step": 70 }, { "epoch": 0.0045, "grad_norm": 14.6875, "grad_norm_var": 182.58118489583333, "learning_rate": 7.48e-05, "loss": 20.2026, "loss/crossentropy": 3.1091376543045044, "loss/hidden": 7.46875, "loss/jsd": 0.0, "loss/logits": 0.994384378194809, "step": 72 }, { "epoch": 0.004625, "grad_norm": 19.75, "grad_norm_var": 81.90494791666667, "learning_rate": 7.66e-05, "loss": 19.8715, "loss/crossentropy": 3.172377586364746, "loss/hidden": 7.359375, "loss/jsd": 0.0, "loss/logits": 1.0095622539520264, "step": 74 }, { "epoch": 0.00475, "grad_norm": 17.0, "grad_norm_var": 81.86875, "learning_rate": 7.840000000000001e-05, "loss": 19.2123, "loss/crossentropy": 2.75563645362854, "loss/hidden": 7.125, "loss/jsd": 0.0, "loss/logits": 0.8763986825942993, "step": 76 }, { "epoch": 0.004875, "grad_norm": 15.9375, "grad_norm_var": 84.11599934895834, "learning_rate": 8.020000000000001e-05, "loss": 19.1961, "loss/crossentropy": 2.981382369995117, "loss/hidden": 7.328125, "loss/jsd": 0.0, "loss/logits": 0.9651070237159729, "step": 78 }, { "epoch": 0.005, "grad_norm": 15.1875, "grad_norm_var": 2.9731770833333333, "learning_rate": 8.200000000000001e-05, "loss": 18.5459, "loss/crossentropy": 3.1191996335983276, "loss/hidden": 7.1875, "loss/jsd": 0.0, "loss/logits": 0.8418412506580353, "step": 80 }, { "epoch": 0.005125, "grad_norm": 14.6875, "grad_norm_var": 2.395166015625, "learning_rate": 8.38e-05, "loss": 18.149, "loss/crossentropy": 2.564948797225952, "loss/hidden": 6.9375, "loss/jsd": 0.0, "loss/logits": 0.8517245650291443, "step": 82 }, { "epoch": 0.00525, "grad_norm": 16.0, "grad_norm_var": 2.08125, "learning_rate": 8.560000000000001e-05, "loss": 18.4829, "loss/crossentropy": 3.1382123231887817, "loss/hidden": 7.0625, "loss/jsd": 0.0, "loss/logits": 0.9703748524188995, "step": 84 }, { "epoch": 0.005375, "grad_norm": 15.9375, "grad_norm_var": 2.0563639322916667, "learning_rate": 8.740000000000001e-05, "loss": 17.9624, "loss/crossentropy": 3.091128706932068, "loss/hidden": 6.75, "loss/jsd": 0.0, "loss/logits": 0.8826551735401154, "step": 86 }, { "epoch": 0.0055, "grad_norm": 15.25, "grad_norm_var": 1.9325358072916667, "learning_rate": 8.92e-05, "loss": 17.5696, "loss/crossentropy": 2.9899988174438477, "loss/hidden": 6.65625, "loss/jsd": 0.0, "loss/logits": 0.7654303312301636, "step": 88 }, { "epoch": 0.005625, "grad_norm": 16.25, "grad_norm_var": 0.789306640625, "learning_rate": 9.1e-05, "loss": 17.0042, "loss/crossentropy": 2.8025410175323486, "loss/hidden": 6.65625, "loss/jsd": 0.0, "loss/logits": 0.7915366590023041, "step": 90 }, { "epoch": 0.00575, "grad_norm": 13.625, "grad_norm_var": 2.162744140625, "learning_rate": 9.28e-05, "loss": 17.2984, "loss/crossentropy": 3.013433814048767, "loss/hidden": 6.5625, "loss/jsd": 0.0, "loss/logits": 0.8098262250423431, "step": 92 }, { "epoch": 0.005875, "grad_norm": 16.5, "grad_norm_var": 2.249739583333333, "learning_rate": 9.46e-05, "loss": 17.1342, "loss/crossentropy": 2.912646174430847, "loss/hidden": 6.453125, "loss/jsd": 0.0, "loss/logits": 0.8038456439971924, "step": 94 }, { "epoch": 0.006, "grad_norm": 11.625, "grad_norm_var": 2.714306640625, "learning_rate": 9.64e-05, "loss": 16.7369, "loss/crossentropy": 2.9029338359832764, "loss/hidden": 6.453125, "loss/jsd": 0.0, "loss/logits": 0.7524089217185974, "step": 96 }, { "epoch": 0.006125, "grad_norm": 12.375, "grad_norm_var": 4.069124348958334, "learning_rate": 9.82e-05, "loss": 16.4013, "loss/crossentropy": 2.8420186042785645, "loss/hidden": 6.390625, "loss/jsd": 0.0, "loss/logits": 0.7496578097343445, "step": 98 }, { "epoch": 0.00625, "grad_norm": 12.5, "grad_norm_var": 3.99140625, "learning_rate": 0.0001, "loss": 16.5129, "loss/crossentropy": 2.7955269813537598, "loss/hidden": 6.34375, "loss/jsd": 0.0, "loss/logits": 0.7174519896507263, "step": 100 }, { "epoch": 0.006375, "grad_norm": 11.6875, "grad_norm_var": 3.717431640625, "learning_rate": 0.0001, "loss": 16.1005, "loss/crossentropy": 2.890980839729309, "loss/hidden": 6.3125, "loss/jsd": 0.0, "loss/logits": 0.7031005620956421, "step": 102 }, { "epoch": 0.0065, "grad_norm": 10.1875, "grad_norm_var": 4.207796223958334, "learning_rate": 0.0001, "loss": 16.0555, "loss/crossentropy": 2.8553627729415894, "loss/hidden": 6.1875, "loss/jsd": 0.0, "loss/logits": 0.6933196187019348, "step": 104 }, { "epoch": 0.006625, "grad_norm": 15.0625, "grad_norm_var": 3.723372395833333, "learning_rate": 0.0001, "loss": 16.25, "loss/crossentropy": 2.7866374254226685, "loss/hidden": 6.140625, "loss/jsd": 0.0, "loss/logits": 0.6986292898654938, "step": 106 }, { "epoch": 0.00675, "grad_norm": 12.25, "grad_norm_var": 3.645556640625, "learning_rate": 0.0001, "loss": 16.4036, "loss/crossentropy": 2.8795191049575806, "loss/hidden": 6.203125, "loss/jsd": 0.0, "loss/logits": 0.6980823576450348, "step": 108 }, { "epoch": 0.006875, "grad_norm": 11.3125, "grad_norm_var": 2.959830729166667, "learning_rate": 0.0001, "loss": 15.5029, "loss/crossentropy": 2.6974622011184692, "loss/hidden": 6.265625, "loss/jsd": 0.0, "loss/logits": 0.7379841208457947, "step": 110 }, { "epoch": 0.007, "grad_norm": 14.375, "grad_norm_var": 2.8355305989583335, "learning_rate": 0.0001, "loss": 15.5842, "loss/crossentropy": 2.5517276525497437, "loss/hidden": 6.0, "loss/jsd": 0.0, "loss/logits": 0.6455385684967041, "step": 112 }, { "epoch": 0.007125, "grad_norm": 12.6875, "grad_norm_var": 2.512093098958333, "learning_rate": 0.0001, "loss": 15.6556, "loss/crossentropy": 2.710301995277405, "loss/hidden": 5.84375, "loss/jsd": 0.0, "loss/logits": 0.7062334418296814, "step": 114 }, { "epoch": 0.00725, "grad_norm": 13.75, "grad_norm_var": 2.6353515625, "learning_rate": 0.0001, "loss": 15.3746, "loss/crossentropy": 2.9104617834091187, "loss/hidden": 5.859375, "loss/jsd": 0.0, "loss/logits": 0.6806878745555878, "step": 116 }, { "epoch": 0.007375, "grad_norm": 10.8125, "grad_norm_var": 2.628059895833333, "learning_rate": 0.0001, "loss": 15.3926, "loss/crossentropy": 2.7969307899475098, "loss/hidden": 5.890625, "loss/jsd": 0.0, "loss/logits": 0.7158068418502808, "step": 118 }, { "epoch": 0.0075, "grad_norm": 11.375, "grad_norm_var": 2.486962890625, "learning_rate": 0.0001, "loss": 15.4671, "loss/crossentropy": 2.758000612258911, "loss/hidden": 6.125, "loss/jsd": 0.0, "loss/logits": 0.692883163690567, "step": 120 }, { "epoch": 0.007625, "grad_norm": 11.9375, "grad_norm_var": 2.400764973958333, "learning_rate": 0.0001, "loss": 15.5392, "loss/crossentropy": 2.814534068107605, "loss/hidden": 5.75, "loss/jsd": 0.0, "loss/logits": 0.669840395450592, "step": 122 }, { "epoch": 0.00775, "grad_norm": 10.875, "grad_norm_var": 1.5585774739583333, "learning_rate": 0.0001, "loss": 15.0821, "loss/crossentropy": 2.4222623109817505, "loss/hidden": 5.8125, "loss/jsd": 0.0, "loss/logits": 0.6296161711215973, "step": 124 }, { "epoch": 0.007875, "grad_norm": 9.375, "grad_norm_var": 2.016650390625, "learning_rate": 0.0001, "loss": 14.7169, "loss/crossentropy": 2.8036348819732666, "loss/hidden": 5.6875, "loss/jsd": 0.0, "loss/logits": 0.5952793657779694, "step": 126 }, { "epoch": 0.008, "grad_norm": 19.5, "grad_norm_var": 5.5265625, "learning_rate": 0.0001, "loss": 15.6228, "loss/crossentropy": 2.871894598007202, "loss/hidden": 5.9375, "loss/jsd": 0.0, "loss/logits": 0.8850542902946472, "step": 128 }, { "epoch": 0.008125, "grad_norm": 13.875, "grad_norm_var": 5.818489583333333, "learning_rate": 0.0001, "loss": 15.3871, "loss/crossentropy": 2.7824504375457764, "loss/hidden": 5.84375, "loss/jsd": 0.0, "loss/logits": 0.6649284958839417, "step": 130 }, { "epoch": 0.00825, "grad_norm": 9.75, "grad_norm_var": 6.017122395833334, "learning_rate": 0.0001, "loss": 14.802, "loss/crossentropy": 2.7720154523849487, "loss/hidden": 5.640625, "loss/jsd": 0.0, "loss/logits": 0.6440402269363403, "step": 132 }, { "epoch": 0.008375, "grad_norm": 12.0625, "grad_norm_var": 5.979947916666666, "learning_rate": 0.0001, "loss": 14.896, "loss/crossentropy": 2.4699759483337402, "loss/hidden": 5.65625, "loss/jsd": 0.0, "loss/logits": 0.631425142288208, "step": 134 }, { "epoch": 0.0085, "grad_norm": 11.8125, "grad_norm_var": 6.357747395833333, "learning_rate": 0.0001, "loss": 14.7694, "loss/crossentropy": 2.9012371301651, "loss/hidden": 5.796875, "loss/jsd": 0.0, "loss/logits": 0.709007978439331, "step": 136 }, { "epoch": 0.008625, "grad_norm": 10.625, "grad_norm_var": 6.092171223958333, "learning_rate": 0.0001, "loss": 14.6342, "loss/crossentropy": 2.7152575254440308, "loss/hidden": 5.703125, "loss/jsd": 0.0, "loss/logits": 0.6243754923343658, "step": 138 }, { "epoch": 0.00875, "grad_norm": 8.6875, "grad_norm_var": 6.715104166666666, "learning_rate": 0.0001, "loss": 14.6597, "loss/crossentropy": 2.5907901525497437, "loss/hidden": 5.578125, "loss/jsd": 0.0, "loss/logits": 0.6299647688865662, "step": 140 }, { "epoch": 0.008875, "grad_norm": 8.1875, "grad_norm_var": 7.1994140625, "learning_rate": 0.0001, "loss": 13.9556, "loss/crossentropy": 2.699749708175659, "loss/hidden": 5.515625, "loss/jsd": 0.0, "loss/logits": 0.5889811217784882, "step": 142 }, { "epoch": 0.009, "grad_norm": 8.5, "grad_norm_var": 3.0494791666666665, "learning_rate": 0.0001, "loss": 13.9315, "loss/crossentropy": 2.455536365509033, "loss/hidden": 5.515625, "loss/jsd": 0.0, "loss/logits": 0.5711115598678589, "step": 144 }, { "epoch": 0.009125, "grad_norm": 11.6875, "grad_norm_var": 1.7934895833333333, "learning_rate": 0.0001, "loss": 14.3392, "loss/crossentropy": 2.5603734254837036, "loss/hidden": 5.515625, "loss/jsd": 0.0, "loss/logits": 0.6282104849815369, "step": 146 }, { "epoch": 0.00925, "grad_norm": 9.9375, "grad_norm_var": 1.81640625, "learning_rate": 0.0001, "loss": 14.155, "loss/crossentropy": 2.7742687463760376, "loss/hidden": 5.546875, "loss/jsd": 0.0, "loss/logits": 0.5838975608348846, "step": 148 }, { "epoch": 0.009375, "grad_norm": 7.28125, "grad_norm_var": 1.9861287434895833, "learning_rate": 0.0001, "loss": 14.2792, "loss/crossentropy": 2.50667142868042, "loss/hidden": 5.515625, "loss/jsd": 0.0, "loss/logits": 0.547415554523468, "step": 150 }, { "epoch": 0.0095, "grad_norm": 12.5, "grad_norm_var": 2.164969889322917, "learning_rate": 0.0001, "loss": 14.4224, "loss/crossentropy": 2.701486349105835, "loss/hidden": 5.4375, "loss/jsd": 0.0, "loss/logits": 0.567545473575592, "step": 152 }, { "epoch": 0.009625, "grad_norm": 10.0, "grad_norm_var": 1.96607666015625, "learning_rate": 0.0001, "loss": 13.5836, "loss/crossentropy": 2.611671805381775, "loss/hidden": 5.375, "loss/jsd": 0.0, "loss/logits": 0.5661961734294891, "step": 154 }, { "epoch": 0.00975, "grad_norm": 9.125, "grad_norm_var": 1.7756795247395833, "learning_rate": 0.0001, "loss": 13.775, "loss/crossentropy": 2.598029851913452, "loss/hidden": 5.359375, "loss/jsd": 0.0, "loss/logits": 0.5618497729301453, "step": 156 }, { "epoch": 0.009875, "grad_norm": 7.9375, "grad_norm_var": 1.92681884765625, "learning_rate": 0.0001, "loss": 13.5367, "loss/crossentropy": 2.6330727338790894, "loss/hidden": 5.40625, "loss/jsd": 0.0, "loss/logits": 0.5757810473442078, "step": 158 }, { "epoch": 0.01, "grad_norm": 9.375, "grad_norm_var": 1.8233683268229166, "learning_rate": 0.0001, "loss": 13.7555, "loss/crossentropy": 2.6642661094665527, "loss/hidden": 5.4375, "loss/jsd": 0.0, "loss/logits": 0.5554981827735901, "step": 160 }, { "epoch": 0.010125, "grad_norm": 7.53125, "grad_norm_var": 1.7612630208333333, "learning_rate": 0.0001, "loss": 13.4486, "loss/crossentropy": 2.670701742172241, "loss/hidden": 5.359375, "loss/jsd": 0.0, "loss/logits": 0.5661377012729645, "step": 162 }, { "epoch": 0.01025, "grad_norm": 10.25, "grad_norm_var": 1.9761678059895833, "learning_rate": 0.0001, "loss": 13.2185, "loss/crossentropy": 2.4556703567504883, "loss/hidden": 5.390625, "loss/jsd": 0.0, "loss/logits": 0.5137010216712952, "step": 164 }, { "epoch": 0.010375, "grad_norm": 7.1875, "grad_norm_var": 1.959228515625, "learning_rate": 0.0001, "loss": 13.2531, "loss/crossentropy": 2.7351680994033813, "loss/hidden": 5.328125, "loss/jsd": 0.0, "loss/logits": 0.5712402760982513, "step": 166 }, { "epoch": 0.0105, "grad_norm": 10.75, "grad_norm_var": 1.699072265625, "learning_rate": 0.0001, "loss": 13.6734, "loss/crossentropy": 2.588118314743042, "loss/hidden": 5.359375, "loss/jsd": 0.0, "loss/logits": 0.5905281007289886, "step": 168 }, { "epoch": 0.010625, "grad_norm": 9.1875, "grad_norm_var": 1.7223795572916667, "learning_rate": 0.0001, "loss": 13.5448, "loss/crossentropy": 2.7176616191864014, "loss/hidden": 5.390625, "loss/jsd": 0.0, "loss/logits": 0.5851459503173828, "step": 170 }, { "epoch": 0.01075, "grad_norm": 7.21875, "grad_norm_var": 2.4208943684895834, "learning_rate": 0.0001, "loss": 13.6468, "loss/crossentropy": 2.6239798069000244, "loss/hidden": 5.234375, "loss/jsd": 0.0, "loss/logits": 0.5490683615207672, "step": 172 }, { "epoch": 0.010875, "grad_norm": 7.625, "grad_norm_var": 2.434273274739583, "learning_rate": 0.0001, "loss": 13.0877, "loss/crossentropy": 2.648572325706482, "loss/hidden": 5.203125, "loss/jsd": 0.0, "loss/logits": 0.5484789907932281, "step": 174 }, { "epoch": 0.011, "grad_norm": 8.5, "grad_norm_var": 2.4641764322916666, "learning_rate": 0.0001, "loss": 12.9536, "loss/crossentropy": 2.6549497842788696, "loss/hidden": 5.171875, "loss/jsd": 0.0, "loss/logits": 0.5412751138210297, "step": 176 }, { "epoch": 0.011125, "grad_norm": 7.84375, "grad_norm_var": 2.4390462239583335, "learning_rate": 0.0001, "loss": 13.4435, "loss/crossentropy": 2.421927332878113, "loss/hidden": 5.15625, "loss/jsd": 0.0, "loss/logits": 0.555980384349823, "step": 178 }, { "epoch": 0.01125, "grad_norm": 8.5625, "grad_norm_var": 2.372119140625, "learning_rate": 0.0001, "loss": 13.2014, "loss/crossentropy": 2.680444836616516, "loss/hidden": 5.046875, "loss/jsd": 0.0, "loss/logits": 0.4972950965166092, "step": 180 }, { "epoch": 0.011375, "grad_norm": 11.0, "grad_norm_var": 73.80558268229167, "learning_rate": 0.0001, "loss": 13.4406, "loss/crossentropy": 2.49616801738739, "loss/hidden": 5.296875, "loss/jsd": 0.0, "loss/logits": 0.5274538397789001, "step": 182 }, { "epoch": 0.0115, "grad_norm": 9.3125, "grad_norm_var": 74.92745768229166, "learning_rate": 0.0001, "loss": 12.7807, "loss/crossentropy": 2.4947917461395264, "loss/hidden": 5.0625, "loss/jsd": 0.0, "loss/logits": 0.5059804916381836, "step": 184 }, { "epoch": 0.011625, "grad_norm": 8.125, "grad_norm_var": 74.739697265625, "learning_rate": 0.0001, "loss": 12.7996, "loss/crossentropy": 2.5512574911117554, "loss/hidden": 5.046875, "loss/jsd": 0.0, "loss/logits": 0.49534276127815247, "step": 186 }, { "epoch": 0.01175, "grad_norm": 6.90625, "grad_norm_var": 75.45623372395833, "learning_rate": 0.0001, "loss": 12.934, "loss/crossentropy": 2.630277991294861, "loss/hidden": 5.140625, "loss/jsd": 0.0, "loss/logits": 0.5003396719694138, "step": 188 }, { "epoch": 0.011875, "grad_norm": 7.625, "grad_norm_var": 76.01287434895833, "learning_rate": 0.0001, "loss": 12.9007, "loss/crossentropy": 2.5264713764190674, "loss/hidden": 5.015625, "loss/jsd": 0.0, "loss/logits": 0.5094788670539856, "step": 190 }, { "epoch": 0.012, "grad_norm": 7.71875, "grad_norm_var": 76.15037434895834, "learning_rate": 0.0001, "loss": 13.0658, "loss/crossentropy": 2.704426646232605, "loss/hidden": 5.046875, "loss/jsd": 0.0, "loss/logits": 0.522599458694458, "step": 192 }, { "epoch": 0.012125, "grad_norm": 8.1875, "grad_norm_var": 76.128125, "learning_rate": 0.0001, "loss": 12.9704, "loss/crossentropy": 2.702088713645935, "loss/hidden": 5.09375, "loss/jsd": 0.0, "loss/logits": 0.5415545701980591, "step": 194 }, { "epoch": 0.01225, "grad_norm": 6.8125, "grad_norm_var": 76.61405843098959, "learning_rate": 0.0001, "loss": 12.7, "loss/crossentropy": 2.705085873603821, "loss/hidden": 5.0625, "loss/jsd": 0.0, "loss/logits": 0.5454063713550568, "step": 196 }, { "epoch": 0.012375, "grad_norm": 8.25, "grad_norm_var": 0.5269368489583334, "learning_rate": 0.0001, "loss": 12.8295, "loss/crossentropy": 2.5774309635162354, "loss/hidden": 5.015625, "loss/jsd": 0.0, "loss/logits": 0.5220803320407867, "step": 198 }, { "epoch": 0.0125, "grad_norm": 6.90625, "grad_norm_var": 0.41435139973958335, "learning_rate": 0.0001, "loss": 12.6159, "loss/crossentropy": 2.1811429262161255, "loss/hidden": 4.9375, "loss/jsd": 0.0, "loss/logits": 0.4608597755432129, "step": 200 }, { "epoch": 0.012625, "grad_norm": 7.46875, "grad_norm_var": 0.252587890625, "learning_rate": 0.0001, "loss": 12.4811, "loss/crossentropy": 2.6321566104888916, "loss/hidden": 5.046875, "loss/jsd": 0.0, "loss/logits": 0.5277400612831116, "step": 202 }, { "epoch": 0.01275, "grad_norm": 8.125, "grad_norm_var": 0.29940999348958336, "learning_rate": 0.0001, "loss": 12.7234, "loss/crossentropy": 2.4674800634384155, "loss/hidden": 4.984375, "loss/jsd": 0.0, "loss/logits": 0.4902832508087158, "step": 204 }, { "epoch": 0.012875, "grad_norm": 7.0625, "grad_norm_var": 0.3092732747395833, "learning_rate": 0.0001, "loss": 12.6082, "loss/crossentropy": 2.418899178504944, "loss/hidden": 5.078125, "loss/jsd": 0.0, "loss/logits": 0.44220657646656036, "step": 206 }, { "epoch": 0.013, "grad_norm": 8.1875, "grad_norm_var": 0.31503499348958336, "learning_rate": 0.0001, "loss": 12.2736, "loss/crossentropy": 2.5233161449432373, "loss/hidden": 5.0, "loss/jsd": 0.0, "loss/logits": 0.4981265068054199, "step": 208 }, { "epoch": 0.013125, "grad_norm": 7.71875, "grad_norm_var": 0.35037434895833336, "learning_rate": 0.0001, "loss": 12.4505, "loss/crossentropy": 2.5429080724716187, "loss/hidden": 4.921875, "loss/jsd": 0.0, "loss/logits": 0.49663229286670685, "step": 210 }, { "epoch": 0.01325, "grad_norm": 7.96875, "grad_norm_var": 0.29889322916666666, "learning_rate": 0.0001, "loss": 12.9271, "loss/crossentropy": 2.7774670124053955, "loss/hidden": 4.96875, "loss/jsd": 0.0, "loss/logits": 0.5223149955272675, "step": 212 }, { "epoch": 0.013375, "grad_norm": 6.5625, "grad_norm_var": 0.37265218098958336, "learning_rate": 0.0001, "loss": 12.2444, "loss/crossentropy": 2.550992965698242, "loss/hidden": 4.890625, "loss/jsd": 0.0, "loss/logits": 0.4571031928062439, "step": 214 }, { "epoch": 0.0135, "grad_norm": 7.53125, "grad_norm_var": 0.29010009765625, "learning_rate": 0.0001, "loss": 12.1625, "loss/crossentropy": 2.2462257146835327, "loss/hidden": 4.875, "loss/jsd": 0.0, "loss/logits": 0.48005372285842896, "step": 216 }, { "epoch": 0.013625, "grad_norm": 7.875, "grad_norm_var": 0.298046875, "learning_rate": 0.0001, "loss": 12.3998, "loss/crossentropy": 2.617794990539551, "loss/hidden": 4.890625, "loss/jsd": 0.0, "loss/logits": 0.5356446206569672, "step": 218 }, { "epoch": 0.01375, "grad_norm": 6.84375, "grad_norm_var": 0.21796875, "learning_rate": 0.0001, "loss": 12.4159, "loss/crossentropy": 2.432363748550415, "loss/hidden": 4.90625, "loss/jsd": 0.0, "loss/logits": 0.5167964398860931, "step": 220 }, { "epoch": 0.013875, "grad_norm": 7.71875, "grad_norm_var": 0.21158447265625, "learning_rate": 0.0001, "loss": 12.267, "loss/crossentropy": 2.5088655948638916, "loss/hidden": 4.90625, "loss/jsd": 0.0, "loss/logits": 0.5033310353755951, "step": 222 }, { "epoch": 0.014, "grad_norm": 8.6875, "grad_norm_var": 0.27616780598958335, "learning_rate": 0.0001, "loss": 12.5268, "loss/crossentropy": 2.463010787963867, "loss/hidden": 4.953125, "loss/jsd": 0.0, "loss/logits": 0.5101533681154251, "step": 224 }, { "epoch": 0.014125, "grad_norm": 7.40625, "grad_norm_var": 0.2908203125, "learning_rate": 0.0001, "loss": 12.4375, "loss/crossentropy": 2.7142781019210815, "loss/hidden": 4.96875, "loss/jsd": 0.0, "loss/logits": 0.48157520592212677, "step": 226 }, { "epoch": 0.01425, "grad_norm": 6.84375, "grad_norm_var": 0.36500244140625, "learning_rate": 0.0001, "loss": 12.1223, "loss/crossentropy": 2.696410059928894, "loss/hidden": 4.859375, "loss/jsd": 0.0, "loss/logits": 0.4689347445964813, "step": 228 }, { "epoch": 0.014375, "grad_norm": 7.84375, "grad_norm_var": 0.3846354166666667, "learning_rate": 0.0001, "loss": 12.2157, "loss/crossentropy": 2.5104721784591675, "loss/hidden": 4.84375, "loss/jsd": 0.0, "loss/logits": 0.4914693534374237, "step": 230 }, { "epoch": 0.0145, "grad_norm": 6.875, "grad_norm_var": 0.41243082682291665, "learning_rate": 0.0001, "loss": 12.3422, "loss/crossentropy": 2.638755679130554, "loss/hidden": 4.953125, "loss/jsd": 0.0, "loss/logits": 0.5526512563228607, "step": 232 }, { "epoch": 0.014625, "grad_norm": 6.1875, "grad_norm_var": 0.521337890625, "learning_rate": 0.0001, "loss": 12.0554, "loss/crossentropy": 2.48711097240448, "loss/hidden": 4.828125, "loss/jsd": 0.0, "loss/logits": 0.4476258456707001, "step": 234 }, { "epoch": 0.01475, "grad_norm": 8.375, "grad_norm_var": 0.54586181640625, "learning_rate": 0.0001, "loss": 12.2178, "loss/crossentropy": 2.5519078969955444, "loss/hidden": 4.96875, "loss/jsd": 0.0, "loss/logits": 0.456609308719635, "step": 236 }, { "epoch": 0.014875, "grad_norm": 7.125, "grad_norm_var": 0.563671875, "learning_rate": 0.0001, "loss": 12.0904, "loss/crossentropy": 2.4752217531204224, "loss/hidden": 4.84375, "loss/jsd": 0.0, "loss/logits": 0.4934917986392975, "step": 238 }, { "epoch": 0.015, "grad_norm": 6.34375, "grad_norm_var": 0.5327962239583334, "learning_rate": 0.0001, "loss": 12.3964, "loss/crossentropy": 2.834780216217041, "loss/hidden": 4.921875, "loss/jsd": 0.0, "loss/logits": 0.5274220705032349, "step": 240 }, { "epoch": 0.015125, "grad_norm": 10.1875, "grad_norm_var": 1.0213826497395833, "learning_rate": 0.0001, "loss": 12.6241, "loss/crossentropy": 2.6608060598373413, "loss/hidden": 4.734375, "loss/jsd": 0.0, "loss/logits": 0.5230352878570557, "step": 242 }, { "epoch": 0.01525, "grad_norm": 9.0625, "grad_norm_var": 1.1087890625, "learning_rate": 0.0001, "loss": 12.4601, "loss/crossentropy": 2.5716851949691772, "loss/hidden": 4.8125, "loss/jsd": 0.0, "loss/logits": 0.5314892381429672, "step": 244 }, { "epoch": 0.015375, "grad_norm": 7.46875, "grad_norm_var": 1.082666015625, "learning_rate": 0.0001, "loss": 12.2352, "loss/crossentropy": 2.655149459838867, "loss/hidden": 4.8125, "loss/jsd": 0.0, "loss/logits": 0.4710581600666046, "step": 246 }, { "epoch": 0.0155, "grad_norm": 5.65625, "grad_norm_var": 1.328515625, "learning_rate": 0.0001, "loss": 11.692, "loss/crossentropy": 2.1561089754104614, "loss/hidden": 4.65625, "loss/jsd": 0.0, "loss/logits": 0.45925989747047424, "step": 248 }, { "epoch": 0.015625, "grad_norm": 6.75, "grad_norm_var": 1.30445556640625, "learning_rate": 0.0001, "loss": 11.7933, "loss/crossentropy": 2.410550117492676, "loss/hidden": 4.546875, "loss/jsd": 0.0, "loss/logits": 0.4565409719944, "step": 250 }, { "epoch": 0.01575, "grad_norm": 6.84375, "grad_norm_var": 1.2310831705729166, "learning_rate": 0.0001, "loss": 11.88, "loss/crossentropy": 2.586890459060669, "loss/hidden": 4.6875, "loss/jsd": 0.0, "loss/logits": 0.4949754476547241, "step": 252 }, { "epoch": 0.015875, "grad_norm": 6.4375, "grad_norm_var": 1.3586873372395833, "learning_rate": 0.0001, "loss": 12.0459, "loss/crossentropy": 2.446923851966858, "loss/hidden": 4.8125, "loss/jsd": 0.0, "loss/logits": 0.46594707667827606, "step": 254 }, { "epoch": 0.016, "grad_norm": 6.40625, "grad_norm_var": 1.3482381184895833, "learning_rate": 0.0001, "loss": 12.1629, "loss/crossentropy": 2.57769775390625, "loss/hidden": 4.65625, "loss/jsd": 0.0, "loss/logits": 0.4595105051994324, "step": 256 }, { "epoch": 0.016125, "grad_norm": 6.625, "grad_norm_var": 0.7177734375, "learning_rate": 0.0001, "loss": 12.0016, "loss/crossentropy": 2.37365186214447, "loss/hidden": 4.625, "loss/jsd": 0.0, "loss/logits": 0.4366963058710098, "step": 258 }, { "epoch": 0.01625, "grad_norm": 6.1875, "grad_norm_var": 0.43580322265625, "learning_rate": 0.0001, "loss": 11.9528, "loss/crossentropy": 2.3685457706451416, "loss/hidden": 4.75, "loss/jsd": 0.0, "loss/logits": 0.45073819160461426, "step": 260 }, { "epoch": 0.016375, "grad_norm": 6.8125, "grad_norm_var": 0.26324462890625, "learning_rate": 0.0001, "loss": 12.0193, "loss/crossentropy": 2.349572777748108, "loss/hidden": 4.71875, "loss/jsd": 0.0, "loss/logits": 0.4415210783481598, "step": 262 }, { "epoch": 0.0165, "grad_norm": 6.4375, "grad_norm_var": 0.204150390625, "learning_rate": 0.0001, "loss": 11.828, "loss/crossentropy": 2.376862049102783, "loss/hidden": 4.53125, "loss/jsd": 0.0, "loss/logits": 0.4309367835521698, "step": 264 }, { "epoch": 0.016625, "grad_norm": 6.6875, "grad_norm_var": 0.20725504557291666, "learning_rate": 0.0001, "loss": 11.6319, "loss/crossentropy": 2.316063165664673, "loss/hidden": 4.65625, "loss/jsd": 0.0, "loss/logits": 0.43444499373435974, "step": 266 }, { "epoch": 0.01675, "grad_norm": 5.5625, "grad_norm_var": 0.18316650390625, "learning_rate": 0.0001, "loss": 11.7038, "loss/crossentropy": 2.402170777320862, "loss/hidden": 4.65625, "loss/jsd": 0.0, "loss/logits": 0.44065244495868683, "step": 268 }, { "epoch": 0.016875, "grad_norm": 6.5625, "grad_norm_var": 0.17633056640625, "learning_rate": 0.0001, "loss": 11.6722, "loss/crossentropy": 2.5016753673553467, "loss/hidden": 4.515625, "loss/jsd": 0.0, "loss/logits": 0.4539669454097748, "step": 270 }, { "epoch": 0.017, "grad_norm": 6.34375, "grad_norm_var": 0.15442708333333333, "learning_rate": 0.0001, "loss": 11.756, "loss/crossentropy": 2.344158887863159, "loss/hidden": 4.46875, "loss/jsd": 0.0, "loss/logits": 0.46556878089904785, "step": 272 }, { "epoch": 0.017125, "grad_norm": 6.25, "grad_norm_var": 0.18977864583333334, "learning_rate": 0.0001, "loss": 11.4672, "loss/crossentropy": 2.3533241748809814, "loss/hidden": 4.5625, "loss/jsd": 0.0, "loss/logits": 0.42739230394363403, "step": 274 }, { "epoch": 0.01725, "grad_norm": 8.0625, "grad_norm_var": 0.48045247395833335, "learning_rate": 0.0001, "loss": 11.7908, "loss/crossentropy": 2.360278367996216, "loss/hidden": 4.546875, "loss/jsd": 0.0, "loss/logits": 0.46030762791633606, "step": 276 }, { "epoch": 0.017375, "grad_norm": 6.5625, "grad_norm_var": 0.45963134765625, "learning_rate": 0.0001, "loss": 11.8649, "loss/crossentropy": 2.433812379837036, "loss/hidden": 4.640625, "loss/jsd": 0.0, "loss/logits": 0.4665477126836777, "step": 278 }, { "epoch": 0.0175, "grad_norm": 5.34375, "grad_norm_var": 0.56021728515625, "learning_rate": 0.0001, "loss": 11.8492, "loss/crossentropy": 2.4910370111465454, "loss/hidden": 4.625, "loss/jsd": 0.0, "loss/logits": 0.4576037526130676, "step": 280 }, { "epoch": 0.017625, "grad_norm": 7.4375, "grad_norm_var": 0.601171875, "learning_rate": 0.0001, "loss": 11.7673, "loss/crossentropy": 2.730518341064453, "loss/hidden": 4.765625, "loss/jsd": 0.0, "loss/logits": 0.5011050552129745, "step": 282 }, { "epoch": 0.01775, "grad_norm": 6.03125, "grad_norm_var": 0.5543904622395833, "learning_rate": 0.0001, "loss": 11.739, "loss/crossentropy": 2.53789222240448, "loss/hidden": 4.625, "loss/jsd": 0.0, "loss/logits": 0.44618333876132965, "step": 284 }, { "epoch": 0.017875, "grad_norm": 6.3125, "grad_norm_var": 0.5624308268229167, "learning_rate": 0.0001, "loss": 12.0365, "loss/crossentropy": 2.5072152614593506, "loss/hidden": 4.5, "loss/jsd": 0.0, "loss/logits": 0.4584163427352905, "step": 286 }, { "epoch": 0.018, "grad_norm": 7.0, "grad_norm_var": 0.56744384765625, "learning_rate": 0.0001, "loss": 11.7216, "loss/crossentropy": 2.713698387145996, "loss/hidden": 4.65625, "loss/jsd": 0.0, "loss/logits": 0.47771963477134705, "step": 288 }, { "epoch": 0.018125, "grad_norm": 5.34375, "grad_norm_var": 0.5739420572916667, "learning_rate": 0.0001, "loss": 11.3088, "loss/crossentropy": 2.4359713792800903, "loss/hidden": 4.53125, "loss/jsd": 0.0, "loss/logits": 0.4259905219078064, "step": 290 }, { "epoch": 0.01825, "grad_norm": 7.5, "grad_norm_var": 0.36584879557291666, "learning_rate": 0.0001, "loss": 11.7531, "loss/crossentropy": 2.5594223737716675, "loss/hidden": 4.578125, "loss/jsd": 0.0, "loss/logits": 0.4502502828836441, "step": 292 }, { "epoch": 0.018375, "grad_norm": 10.1875, "grad_norm_var": 1.243994140625, "learning_rate": 0.0001, "loss": 12.587, "loss/crossentropy": 2.979864239692688, "loss/hidden": 4.90625, "loss/jsd": 0.0, "loss/logits": 0.6050755679607391, "step": 294 }, { "epoch": 0.0185, "grad_norm": 6.125, "grad_norm_var": 3.3922159830729166, "learning_rate": 0.0001, "loss": 12.0734, "loss/crossentropy": 2.6472045183181763, "loss/hidden": 4.515625, "loss/jsd": 0.0, "loss/logits": 0.4378223419189453, "step": 296 }, { "epoch": 0.018625, "grad_norm": 6.125, "grad_norm_var": 3.450972493489583, "learning_rate": 0.0001, "loss": 11.4956, "loss/crossentropy": 2.5419256687164307, "loss/hidden": 4.546875, "loss/jsd": 0.0, "loss/logits": 0.4189437925815582, "step": 298 }, { "epoch": 0.01875, "grad_norm": 6.4375, "grad_norm_var": 3.3952433268229165, "learning_rate": 0.0001, "loss": 11.4754, "loss/crossentropy": 2.4922432899475098, "loss/hidden": 4.53125, "loss/jsd": 0.0, "loss/logits": 0.4700406640768051, "step": 300 }, { "epoch": 0.018875, "grad_norm": 5.8125, "grad_norm_var": 3.46099853515625, "learning_rate": 0.0001, "loss": 11.4791, "loss/crossentropy": 2.1792010068893433, "loss/hidden": 4.390625, "loss/jsd": 0.0, "loss/logits": 0.4193985015153885, "step": 302 }, { "epoch": 0.019, "grad_norm": 5.8125, "grad_norm_var": 3.5378743489583333, "learning_rate": 0.0001, "loss": 11.4713, "loss/crossentropy": 2.5582441091537476, "loss/hidden": 4.5625, "loss/jsd": 0.0, "loss/logits": 0.45763692259788513, "step": 304 }, { "epoch": 0.019125, "grad_norm": 6.78125, "grad_norm_var": 3.42242431640625, "learning_rate": 0.0001, "loss": 11.5659, "loss/crossentropy": 2.501905083656311, "loss/hidden": 4.453125, "loss/jsd": 0.0, "loss/logits": 0.43473342061042786, "step": 306 }, { "epoch": 0.01925, "grad_norm": 5.9375, "grad_norm_var": 3.487744140625, "learning_rate": 0.0001, "loss": 11.6014, "loss/crossentropy": 2.5585442781448364, "loss/hidden": 4.625, "loss/jsd": 0.0, "loss/logits": 0.5095447897911072, "step": 308 }, { "epoch": 0.019375, "grad_norm": 6.21875, "grad_norm_var": 2.76822509765625, "learning_rate": 0.0001, "loss": 11.567, "loss/crossentropy": 2.4677284955978394, "loss/hidden": 4.546875, "loss/jsd": 0.0, "loss/logits": 0.4294033944606781, "step": 310 }, { "epoch": 0.0195, "grad_norm": 6.25, "grad_norm_var": 0.22245686848958332, "learning_rate": 0.0001, "loss": 11.2982, "loss/crossentropy": 2.3961949348449707, "loss/hidden": 4.5, "loss/jsd": 0.0, "loss/logits": 0.4119955450296402, "step": 312 }, { "epoch": 0.019625, "grad_norm": 5.9375, "grad_norm_var": 0.32304280598958335, "learning_rate": 0.0001, "loss": 11.5074, "loss/crossentropy": 2.310800790786743, "loss/hidden": 4.546875, "loss/jsd": 0.0, "loss/logits": 0.42905446887016296, "step": 314 }, { "epoch": 0.01975, "grad_norm": 5.8125, "grad_norm_var": 0.33020426432291666, "learning_rate": 0.0001, "loss": 11.3422, "loss/crossentropy": 2.4780253171920776, "loss/hidden": 4.421875, "loss/jsd": 0.0, "loss/logits": 0.45472322404384613, "step": 316 }, { "epoch": 0.019875, "grad_norm": 5.9375, "grad_norm_var": 0.33508707682291666, "learning_rate": 0.0001, "loss": 11.5055, "loss/crossentropy": 2.6155530214309692, "loss/hidden": 4.578125, "loss/jsd": 0.0, "loss/logits": 0.48140254616737366, "step": 318 }, { "epoch": 0.02, "grad_norm": 6.5, "grad_norm_var": 0.31060791015625, "learning_rate": 0.0001, "loss": 11.3301, "loss/crossentropy": 2.5840269327163696, "loss/hidden": 4.6875, "loss/jsd": 0.0, "loss/logits": 0.4504729211330414, "step": 320 }, { "epoch": 0.020125, "grad_norm": 5.9375, "grad_norm_var": 0.3453125, "learning_rate": 0.0001, "loss": 11.7327, "loss/crossentropy": 2.5163029432296753, "loss/hidden": 4.5, "loss/jsd": 0.0, "loss/logits": 0.45117421448230743, "step": 322 }, { "epoch": 0.02025, "grad_norm": 5.375, "grad_norm_var": 0.71353759765625, "learning_rate": 0.0001, "loss": 11.4897, "loss/crossentropy": 2.4774335622787476, "loss/hidden": 4.453125, "loss/jsd": 0.0, "loss/logits": 0.4575677663087845, "step": 324 }, { "epoch": 0.020375, "grad_norm": 6.78125, "grad_norm_var": 0.67857666015625, "learning_rate": 0.0001, "loss": 11.1918, "loss/crossentropy": 2.431584596633911, "loss/hidden": 4.390625, "loss/jsd": 0.0, "loss/logits": 0.4284675121307373, "step": 326 }, { "epoch": 0.0205, "grad_norm": 6.03125, "grad_norm_var": 0.6340779622395833, "learning_rate": 0.0001, "loss": 11.2356, "loss/crossentropy": 2.2354328632354736, "loss/hidden": 4.53125, "loss/jsd": 0.0, "loss/logits": 0.44321516156196594, "step": 328 }, { "epoch": 0.020625, "grad_norm": 5.4375, "grad_norm_var": 0.5881510416666667, "learning_rate": 0.0001, "loss": 11.3096, "loss/crossentropy": 2.349725842475891, "loss/hidden": 4.421875, "loss/jsd": 0.0, "loss/logits": 0.39839838445186615, "step": 330 }, { "epoch": 0.02075, "grad_norm": 5.625, "grad_norm_var": 0.6135050455729166, "learning_rate": 0.0001, "loss": 11.5636, "loss/crossentropy": 2.704393744468689, "loss/hidden": 4.4375, "loss/jsd": 0.0, "loss/logits": 0.42811933159828186, "step": 332 }, { "epoch": 0.020875, "grad_norm": 5.71875, "grad_norm_var": 0.6152180989583333, "learning_rate": 0.0001, "loss": 11.1634, "loss/crossentropy": 2.2040624618530273, "loss/hidden": 4.578125, "loss/jsd": 0.0, "loss/logits": 0.38388490676879883, "step": 334 }, { "epoch": 0.021, "grad_norm": 7.15625, "grad_norm_var": 0.6815104166666667, "learning_rate": 0.0001, "loss": 11.2019, "loss/crossentropy": 2.334352135658264, "loss/hidden": 4.359375, "loss/jsd": 0.0, "loss/logits": 0.4286506623029709, "step": 336 }, { "epoch": 0.021125, "grad_norm": 6.21875, "grad_norm_var": 0.6352213541666667, "learning_rate": 0.0001, "loss": 11.1504, "loss/crossentropy": 2.390920877456665, "loss/hidden": 4.328125, "loss/jsd": 0.0, "loss/logits": 0.4297266751527786, "step": 338 }, { "epoch": 0.02125, "grad_norm": 5.40625, "grad_norm_var": 0.27034098307291665, "learning_rate": 0.0001, "loss": 10.968, "loss/crossentropy": 2.589638829231262, "loss/hidden": 4.453125, "loss/jsd": 0.0, "loss/logits": 0.41065070033073425, "step": 340 }, { "epoch": 0.021375, "grad_norm": 6.59375, "grad_norm_var": 1.3688151041666667, "learning_rate": 0.0001, "loss": 11.5793, "loss/crossentropy": 2.735411524772644, "loss/hidden": 4.421875, "loss/jsd": 0.0, "loss/logits": 0.43197204172611237, "step": 342 }, { "epoch": 0.0215, "grad_norm": 6.34375, "grad_norm_var": 1.35992431640625, "learning_rate": 0.0001, "loss": 11.1228, "loss/crossentropy": 2.424543857574463, "loss/hidden": 4.375, "loss/jsd": 0.0, "loss/logits": 0.4435572326183319, "step": 344 }, { "epoch": 0.021625, "grad_norm": 6.15625, "grad_norm_var": 1.3089803059895833, "learning_rate": 0.0001, "loss": 11.3887, "loss/crossentropy": 2.3145734071731567, "loss/hidden": 4.484375, "loss/jsd": 0.0, "loss/logits": 0.4381801038980484, "step": 346 }, { "epoch": 0.02175, "grad_norm": 5.90625, "grad_norm_var": 1.2807576497395834, "learning_rate": 0.0001, "loss": 11.3132, "loss/crossentropy": 2.4313782453536987, "loss/hidden": 4.3125, "loss/jsd": 0.0, "loss/logits": 0.44939403235912323, "step": 348 }, { "epoch": 0.021875, "grad_norm": 6.3125, "grad_norm_var": 1.2580078125, "learning_rate": 0.0001, "loss": 11.2425, "loss/crossentropy": 2.2621657848358154, "loss/hidden": 4.296875, "loss/jsd": 0.0, "loss/logits": 0.4138593226671219, "step": 350 }, { "epoch": 0.022, "grad_norm": 5.78125, "grad_norm_var": 1.2151652018229167, "learning_rate": 0.0001, "loss": 11.4478, "loss/crossentropy": 2.9294599294662476, "loss/hidden": 4.40625, "loss/jsd": 0.0, "loss/logits": 0.43130405247211456, "step": 352 }, { "epoch": 0.022125, "grad_norm": 5.28125, "grad_norm_var": 1.3285807291666667, "learning_rate": 0.0001, "loss": 11.0711, "loss/crossentropy": 2.4152419567108154, "loss/hidden": 4.453125, "loss/jsd": 0.0, "loss/logits": 0.4246339052915573, "step": 354 }, { "epoch": 0.02225, "grad_norm": 5.28125, "grad_norm_var": 1.4159993489583333, "learning_rate": 0.0001, "loss": 10.4805, "loss/crossentropy": 2.342584490776062, "loss/hidden": 4.3125, "loss/jsd": 0.0, "loss/logits": 0.39343252778053284, "step": 356 }, { "epoch": 0.022375, "grad_norm": 5.3125, "grad_norm_var": 0.24654947916666667, "learning_rate": 0.0001, "loss": 10.7117, "loss/crossentropy": 2.268111824989319, "loss/hidden": 4.25, "loss/jsd": 0.0, "loss/logits": 0.37643152475357056, "step": 358 }, { "epoch": 0.0225, "grad_norm": 5.4375, "grad_norm_var": 0.26873372395833334, "learning_rate": 0.0001, "loss": 10.8794, "loss/crossentropy": 2.2588669061660767, "loss/hidden": 4.203125, "loss/jsd": 0.0, "loss/logits": 0.38288983702659607, "step": 360 }, { "epoch": 0.022625, "grad_norm": 5.46875, "grad_norm_var": 0.2638631184895833, "learning_rate": 0.0001, "loss": 11.0366, "loss/crossentropy": 2.458423137664795, "loss/hidden": 4.25, "loss/jsd": 0.0, "loss/logits": 0.42026595771312714, "step": 362 }, { "epoch": 0.02275, "grad_norm": 6.8125, "grad_norm_var": 0.31164957682291666, "learning_rate": 0.0001, "loss": 11.3242, "loss/crossentropy": 2.321816086769104, "loss/hidden": 4.28125, "loss/jsd": 0.0, "loss/logits": 0.3996615409851074, "step": 364 }, { "epoch": 0.022875, "grad_norm": 5.4375, "grad_norm_var": 0.30230712890625, "learning_rate": 0.0001, "loss": 10.7526, "loss/crossentropy": 2.370081901550293, "loss/hidden": 4.234375, "loss/jsd": 0.0, "loss/logits": 0.39483100175857544, "step": 366 }, { "epoch": 0.023, "grad_norm": 5.40625, "grad_norm_var": 0.22395833333333334, "learning_rate": 0.0001, "loss": 10.7775, "loss/crossentropy": 2.292533278465271, "loss/hidden": 4.234375, "loss/jsd": 0.0, "loss/logits": 0.4388478994369507, "step": 368 }, { "epoch": 0.023125, "grad_norm": 5.5, "grad_norm_var": 0.22610270182291667, "learning_rate": 0.0001, "loss": 10.8841, "loss/crossentropy": 2.3297876119613647, "loss/hidden": 4.25, "loss/jsd": 0.0, "loss/logits": 0.4065796434879303, "step": 370 }, { "epoch": 0.02325, "grad_norm": 6.125, "grad_norm_var": 0.23635660807291667, "learning_rate": 0.0001, "loss": 10.7713, "loss/crossentropy": 2.5502147674560547, "loss/hidden": 4.296875, "loss/jsd": 0.0, "loss/logits": 0.3998092859983444, "step": 372 }, { "epoch": 0.023375, "grad_norm": 5.15625, "grad_norm_var": 0.23821614583333334, "learning_rate": 0.0001, "loss": 11.0205, "loss/crossentropy": 2.385592818260193, "loss/hidden": 4.296875, "loss/jsd": 0.0, "loss/logits": 0.4194333106279373, "step": 374 }, { "epoch": 0.0235, "grad_norm": 6.09375, "grad_norm_var": 0.23645833333333333, "learning_rate": 0.0001, "loss": 10.8672, "loss/crossentropy": 2.4565255641937256, "loss/hidden": 4.3125, "loss/jsd": 0.0, "loss/logits": 0.4082058221101761, "step": 376 }, { "epoch": 0.023625, "grad_norm": 5.28125, "grad_norm_var": 0.27864176432291665, "learning_rate": 0.0001, "loss": 11.2051, "loss/crossentropy": 2.621595621109009, "loss/hidden": 4.484375, "loss/jsd": 0.0, "loss/logits": 0.44046278297901154, "step": 378 }, { "epoch": 0.02375, "grad_norm": 5.875, "grad_norm_var": 0.18631184895833333, "learning_rate": 0.0001, "loss": 11.0582, "loss/crossentropy": 2.856778144836426, "loss/hidden": 4.328125, "loss/jsd": 0.0, "loss/logits": 0.42068275809288025, "step": 380 }, { "epoch": 0.023875, "grad_norm": 5.96875, "grad_norm_var": 0.18183186848958333, "learning_rate": 0.0001, "loss": 10.952, "loss/crossentropy": 2.5854907035827637, "loss/hidden": 4.265625, "loss/jsd": 0.0, "loss/logits": 0.40150247514247894, "step": 382 }, { "epoch": 0.024, "grad_norm": 5.8125, "grad_norm_var": 0.17877604166666666, "learning_rate": 0.0001, "loss": 10.7008, "loss/crossentropy": 2.3077510595321655, "loss/hidden": 4.234375, "loss/jsd": 0.0, "loss/logits": 0.39931294322013855, "step": 384 }, { "epoch": 0.024125, "grad_norm": 5.40625, "grad_norm_var": 0.172509765625, "learning_rate": 0.0001, "loss": 10.931, "loss/crossentropy": 2.533818006515503, "loss/hidden": 4.3125, "loss/jsd": 0.0, "loss/logits": 0.43096111714839935, "step": 386 }, { "epoch": 0.02425, "grad_norm": 6.5625, "grad_norm_var": 0.21328125, "learning_rate": 0.0001, "loss": 10.9486, "loss/crossentropy": 2.2463923692703247, "loss/hidden": 4.21875, "loss/jsd": 0.0, "loss/logits": 0.4105375409126282, "step": 388 }, { "epoch": 0.024375, "grad_norm": 6.40625, "grad_norm_var": 0.21083577473958334, "learning_rate": 0.0001, "loss": 10.983, "loss/crossentropy": 2.6630618572235107, "loss/hidden": 4.296875, "loss/jsd": 0.0, "loss/logits": 0.4139321595430374, "step": 390 }, { "epoch": 0.0245, "grad_norm": 4.8125, "grad_norm_var": 0.24898681640625, "learning_rate": 0.0001, "loss": 10.6566, "loss/crossentropy": 2.309110641479492, "loss/hidden": 4.21875, "loss/jsd": 0.0, "loss/logits": 0.35926803946495056, "step": 392 }, { "epoch": 0.024625, "grad_norm": 5.4375, "grad_norm_var": 0.21187744140625, "learning_rate": 0.0001, "loss": 10.6611, "loss/crossentropy": 2.554847478866577, "loss/hidden": 4.265625, "loss/jsd": 0.0, "loss/logits": 0.4162745624780655, "step": 394 }, { "epoch": 0.02475, "grad_norm": 5.21875, "grad_norm_var": 0.21630452473958334, "learning_rate": 0.0001, "loss": 10.5844, "loss/crossentropy": 2.6489609479904175, "loss/hidden": 4.265625, "loss/jsd": 0.0, "loss/logits": 0.40658123791217804, "step": 396 }, { "epoch": 0.024875, "grad_norm": 5.03125, "grad_norm_var": 0.21614176432291668, "learning_rate": 0.0001, "loss": 10.7177, "loss/crossentropy": 2.542907238006592, "loss/hidden": 4.21875, "loss/jsd": 0.0, "loss/logits": 0.3921196609735489, "step": 398 }, { "epoch": 0.025, "grad_norm": 5.34375, "grad_norm_var": 0.21314697265625, "learning_rate": 0.0001, "loss": 11.1447, "loss/crossentropy": 2.7548632621765137, "loss/hidden": 4.25, "loss/jsd": 0.0, "loss/logits": 0.42924974858760834, "step": 400 }, { "epoch": 0.025125, "grad_norm": 7.40625, "grad_norm_var": 0.46285400390625, "learning_rate": 0.0001, "loss": 10.7675, "loss/crossentropy": 2.46126389503479, "loss/hidden": 4.171875, "loss/jsd": 0.0, "loss/logits": 0.39891795814037323, "step": 402 }, { "epoch": 0.02525, "grad_norm": 6.0, "grad_norm_var": 0.4099609375, "learning_rate": 0.0001, "loss": 10.6513, "loss/crossentropy": 2.4365785121917725, "loss/hidden": 4.25, "loss/jsd": 0.0, "loss/logits": 0.38683582842350006, "step": 404 }, { "epoch": 0.025375, "grad_norm": 5.1875, "grad_norm_var": 0.379931640625, "learning_rate": 0.0001, "loss": 10.8399, "loss/crossentropy": 2.379759669303894, "loss/hidden": 4.21875, "loss/jsd": 0.0, "loss/logits": 0.39575886726379395, "step": 406 }, { "epoch": 0.0255, "grad_norm": 5.65625, "grad_norm_var": 0.367822265625, "learning_rate": 0.0001, "loss": 10.6569, "loss/crossentropy": 2.29716956615448, "loss/hidden": 4.21875, "loss/jsd": 0.0, "loss/logits": 0.4569971561431885, "step": 408 }, { "epoch": 0.025625, "grad_norm": 5.03125, "grad_norm_var": 0.39303385416666664, "learning_rate": 0.0001, "loss": 10.9526, "loss/crossentropy": 2.5199685096740723, "loss/hidden": 4.390625, "loss/jsd": 0.0, "loss/logits": 0.46622008085250854, "step": 410 }, { "epoch": 0.02575, "grad_norm": 5.03125, "grad_norm_var": 0.40784098307291666, "learning_rate": 0.0001, "loss": 10.6827, "loss/crossentropy": 2.633329153060913, "loss/hidden": 4.1875, "loss/jsd": 0.0, "loss/logits": 0.3798275887966156, "step": 412 }, { "epoch": 0.025875, "grad_norm": 5.9375, "grad_norm_var": 0.42083333333333334, "learning_rate": 0.0001, "loss": 10.9265, "loss/crossentropy": 2.690458655357361, "loss/hidden": 4.265625, "loss/jsd": 0.0, "loss/logits": 0.4021689295768738, "step": 414 }, { "epoch": 0.026, "grad_norm": 5.0625, "grad_norm_var": 0.45126546223958336, "learning_rate": 0.0001, "loss": 10.9415, "loss/crossentropy": 2.5202553272247314, "loss/hidden": 4.203125, "loss/jsd": 0.0, "loss/logits": 0.41354137659072876, "step": 416 }, { "epoch": 0.026125, "grad_norm": 5.96875, "grad_norm_var": 0.19875895182291667, "learning_rate": 0.0001, "loss": 10.868, "loss/crossentropy": 2.6249037981033325, "loss/hidden": 4.28125, "loss/jsd": 0.0, "loss/logits": 0.4118567407131195, "step": 418 }, { "epoch": 0.02625, "grad_norm": 4.625, "grad_norm_var": 0.20584309895833333, "learning_rate": 0.0001, "loss": 10.6141, "loss/crossentropy": 2.4828044176101685, "loss/hidden": 4.15625, "loss/jsd": 0.0, "loss/logits": 0.3660377264022827, "step": 420 }, { "epoch": 0.026375, "grad_norm": 5.1875, "grad_norm_var": 0.20572916666666666, "learning_rate": 0.0001, "loss": 10.8148, "loss/crossentropy": 2.5954415798187256, "loss/hidden": 4.171875, "loss/jsd": 0.0, "loss/logits": 0.36801303923130035, "step": 422 }, { "epoch": 0.0265, "grad_norm": 5.5, "grad_norm_var": 0.18694254557291667, "learning_rate": 0.0001, "loss": 10.6899, "loss/crossentropy": 2.2876476049423218, "loss/hidden": 4.171875, "loss/jsd": 0.0, "loss/logits": 0.37484128773212433, "step": 424 }, { "epoch": 0.026625, "grad_norm": 5.4375, "grad_norm_var": 0.14928385416666667, "learning_rate": 0.0001, "loss": 10.92, "loss/crossentropy": 2.5665252208709717, "loss/hidden": 4.171875, "loss/jsd": 0.0, "loss/logits": 0.42709940671920776, "step": 426 }, { "epoch": 0.02675, "grad_norm": 5.25, "grad_norm_var": 0.16708577473958333, "learning_rate": 0.0001, "loss": 10.9522, "loss/crossentropy": 2.5319453477859497, "loss/hidden": 4.171875, "loss/jsd": 0.0, "loss/logits": 0.4047371447086334, "step": 428 }, { "epoch": 0.026875, "grad_norm": 6.28125, "grad_norm_var": 0.18268229166666666, "learning_rate": 0.0001, "loss": 10.7146, "loss/crossentropy": 2.5017552375793457, "loss/hidden": 4.171875, "loss/jsd": 0.0, "loss/logits": 0.40194234251976013, "step": 430 }, { "epoch": 0.027, "grad_norm": 4.96875, "grad_norm_var": 0.20439046223958332, "learning_rate": 0.0001, "loss": 10.5358, "loss/crossentropy": 2.3956456184387207, "loss/hidden": 4.203125, "loss/jsd": 0.0, "loss/logits": 0.3840651959180832, "step": 432 }, { "epoch": 0.027125, "grad_norm": 5.4375, "grad_norm_var": 0.19501546223958333, "learning_rate": 0.0001, "loss": 10.8501, "loss/crossentropy": 2.5979639291763306, "loss/hidden": 4.40625, "loss/jsd": 0.0, "loss/logits": 0.4386890381574631, "step": 434 }, { "epoch": 0.02725, "grad_norm": 5.5, "grad_norm_var": 0.16083577473958333, "learning_rate": 0.0001, "loss": 10.6397, "loss/crossentropy": 2.707968592643738, "loss/hidden": 4.1875, "loss/jsd": 0.0, "loss/logits": 0.3836878836154938, "step": 436 }, { "epoch": 0.027375, "grad_norm": 4.78125, "grad_norm_var": 0.18684895833333334, "learning_rate": 0.0001, "loss": 10.8105, "loss/crossentropy": 2.6276891231536865, "loss/hidden": 4.21875, "loss/jsd": 0.0, "loss/logits": 0.40945254266262054, "step": 438 }, { "epoch": 0.0275, "grad_norm": 5.0625, "grad_norm_var": 0.20623372395833334, "learning_rate": 0.0001, "loss": 10.1525, "loss/crossentropy": 2.269914984703064, "loss/hidden": 4.15625, "loss/jsd": 0.0, "loss/logits": 0.36995893716812134, "step": 440 }, { "epoch": 0.027625, "grad_norm": 5.1875, "grad_norm_var": 0.20367431640625, "learning_rate": 0.0001, "loss": 10.59, "loss/crossentropy": 2.4260438680648804, "loss/hidden": 4.0390625, "loss/jsd": 0.0, "loss/logits": 0.40428994596004486, "step": 442 }, { "epoch": 0.02775, "grad_norm": 5.0625, "grad_norm_var": 0.16868489583333332, "learning_rate": 0.0001, "loss": 10.6862, "loss/crossentropy": 2.319399118423462, "loss/hidden": 4.15625, "loss/jsd": 0.0, "loss/logits": 0.3804202526807785, "step": 444 }, { "epoch": 0.027875, "grad_norm": 5.8125, "grad_norm_var": 0.11953125, "learning_rate": 0.0001, "loss": 10.7438, "loss/crossentropy": 2.556222915649414, "loss/hidden": 4.171875, "loss/jsd": 0.0, "loss/logits": 0.3961835205554962, "step": 446 }, { "epoch": 0.028, "grad_norm": 4.78125, "grad_norm_var": 0.12519124348958333, "learning_rate": 0.0001, "loss": 10.7723, "loss/crossentropy": 2.5780078172683716, "loss/hidden": 4.21875, "loss/jsd": 0.0, "loss/logits": 0.4192471206188202, "step": 448 }, { "epoch": 0.028125, "grad_norm": 5.8125, "grad_norm_var": 0.124462890625, "learning_rate": 0.0001, "loss": 10.7281, "loss/crossentropy": 2.4442174434661865, "loss/hidden": 4.34375, "loss/jsd": 0.0, "loss/logits": 0.44460536539554596, "step": 450 }, { "epoch": 0.02825, "grad_norm": 5.0, "grad_norm_var": 0.12102864583333334, "learning_rate": 0.0001, "loss": 10.5439, "loss/crossentropy": 2.3095200061798096, "loss/hidden": 4.03125, "loss/jsd": 0.0, "loss/logits": 0.3880293220281601, "step": 452 }, { "epoch": 0.028375, "grad_norm": 5.03125, "grad_norm_var": 0.11168212890625, "learning_rate": 0.0001, "loss": 10.4828, "loss/crossentropy": 2.263134002685547, "loss/hidden": 4.15625, "loss/jsd": 0.0, "loss/logits": 0.4143233299255371, "step": 454 }, { "epoch": 0.0285, "grad_norm": 5.40625, "grad_norm_var": 0.10038655598958333, "learning_rate": 0.0001, "loss": 10.7777, "loss/crossentropy": 2.4948445558547974, "loss/hidden": 4.171875, "loss/jsd": 0.0, "loss/logits": 0.4150776267051697, "step": 456 }, { "epoch": 0.028625, "grad_norm": 4.5, "grad_norm_var": 0.14049072265625, "learning_rate": 0.0001, "loss": 10.2437, "loss/crossentropy": 2.6246066093444824, "loss/hidden": 4.1875, "loss/jsd": 0.0, "loss/logits": 0.4077821969985962, "step": 458 }, { "epoch": 0.02875, "grad_norm": 4.8125, "grad_norm_var": 0.19472249348958334, "learning_rate": 0.0001, "loss": 10.701, "loss/crossentropy": 2.5822086334228516, "loss/hidden": 4.21875, "loss/jsd": 0.0, "loss/logits": 0.390767902135849, "step": 460 }, { "epoch": 0.028875, "grad_norm": 4.96875, "grad_norm_var": 0.173046875, "learning_rate": 0.0001, "loss": 10.7852, "loss/crossentropy": 2.5109020471572876, "loss/hidden": 4.140625, "loss/jsd": 0.0, "loss/logits": 0.35967275500297546, "step": 462 }, { "epoch": 0.029, "grad_norm": 5.0, "grad_norm_var": 0.15071207682291668, "learning_rate": 0.0001, "loss": 10.6219, "loss/crossentropy": 2.407975435256958, "loss/hidden": 4.03125, "loss/jsd": 0.0, "loss/logits": 0.3790464997291565, "step": 464 }, { "epoch": 0.029125, "grad_norm": 5.53125, "grad_norm_var": 0.13730061848958333, "learning_rate": 0.0001, "loss": 10.3357, "loss/crossentropy": 2.564648985862732, "loss/hidden": 4.09375, "loss/jsd": 0.0, "loss/logits": 0.3769105225801468, "step": 466 }, { "epoch": 0.02925, "grad_norm": 4.96875, "grad_norm_var": 0.1544921875, "learning_rate": 0.0001, "loss": 10.566, "loss/crossentropy": 2.4403003454208374, "loss/hidden": 4.109375, "loss/jsd": 0.0, "loss/logits": 0.3570362627506256, "step": 468 }, { "epoch": 0.029375, "grad_norm": 4.53125, "grad_norm_var": 0.18782145182291668, "learning_rate": 0.0001, "loss": 10.4887, "loss/crossentropy": 2.3666934967041016, "loss/hidden": 4.171875, "loss/jsd": 0.0, "loss/logits": 0.40434208512306213, "step": 470 }, { "epoch": 0.0295, "grad_norm": 5.46875, "grad_norm_var": 0.189697265625, "learning_rate": 0.0001, "loss": 10.7792, "loss/crossentropy": 2.679360866546631, "loss/hidden": 4.109375, "loss/jsd": 0.0, "loss/logits": 0.383465513586998, "step": 472 }, { "epoch": 0.029625, "grad_norm": 6.09375, "grad_norm_var": 1.400390625, "learning_rate": 0.0001, "loss": 10.7712, "loss/crossentropy": 2.700055718421936, "loss/hidden": 4.171875, "loss/jsd": 0.0, "loss/logits": 0.412184476852417, "step": 474 }, { "epoch": 0.02975, "grad_norm": 5.21875, "grad_norm_var": 1.6235026041666667, "learning_rate": 0.0001, "loss": 10.6595, "loss/crossentropy": 2.456274390220642, "loss/hidden": 4.15625, "loss/jsd": 0.0, "loss/logits": 0.40182630717754364, "step": 476 }, { "epoch": 0.029875, "grad_norm": 4.78125, "grad_norm_var": 1.6465983072916666, "learning_rate": 0.0001, "loss": 10.7663, "loss/crossentropy": 2.4097973108291626, "loss/hidden": 4.109375, "loss/jsd": 0.0, "loss/logits": 0.3738597333431244, "step": 478 }, { "epoch": 0.03, "grad_norm": 5.8125, "grad_norm_var": 1.5962076822916667, "learning_rate": 0.0001, "loss": 10.7901, "loss/crossentropy": 2.4475165605545044, "loss/hidden": 4.171875, "loss/jsd": 0.0, "loss/logits": 0.40748435258865356, "step": 480 }, { "epoch": 0.030125, "grad_norm": 4.75, "grad_norm_var": 1.6552042643229166, "learning_rate": 0.0001, "loss": 10.1387, "loss/crossentropy": 2.28298556804657, "loss/hidden": 3.96875, "loss/jsd": 0.0, "loss/logits": 0.36281222105026245, "step": 482 }, { "epoch": 0.03025, "grad_norm": 5.9375, "grad_norm_var": 1.6994425455729167, "learning_rate": 0.0001, "loss": 10.316, "loss/crossentropy": 2.3355804681777954, "loss/hidden": 4.171875, "loss/jsd": 0.0, "loss/logits": 0.34566931426525116, "step": 484 }, { "epoch": 0.030375, "grad_norm": 4.8125, "grad_norm_var": 1.6235026041666667, "learning_rate": 0.0001, "loss": 10.6935, "loss/crossentropy": 2.6970983743667603, "loss/hidden": 4.09375, "loss/jsd": 0.0, "loss/logits": 0.4392566382884979, "step": 486 }, { "epoch": 0.0305, "grad_norm": 7.625, "grad_norm_var": 1.9055826822916666, "learning_rate": 0.0001, "loss": 10.5326, "loss/crossentropy": 2.4185194969177246, "loss/hidden": 4.171875, "loss/jsd": 0.0, "loss/logits": 0.3976728916168213, "step": 488 }, { "epoch": 0.030625, "grad_norm": 5.09375, "grad_norm_var": 0.86724853515625, "learning_rate": 0.0001, "loss": 10.5936, "loss/crossentropy": 2.390311121940613, "loss/hidden": 4.0859375, "loss/jsd": 0.0, "loss/logits": 0.379798486828804, "step": 490 }, { "epoch": 0.03075, "grad_norm": 4.71875, "grad_norm_var": 0.5812337239583333, "learning_rate": 0.0001, "loss": 10.4348, "loss/crossentropy": 2.5370208024978638, "loss/hidden": 4.109375, "loss/jsd": 0.0, "loss/logits": 0.3812776803970337, "step": 492 }, { "epoch": 0.030875, "grad_norm": 4.53125, "grad_norm_var": 0.59869384765625, "learning_rate": 0.0001, "loss": 10.2674, "loss/crossentropy": 2.400723934173584, "loss/hidden": 4.078125, "loss/jsd": 0.0, "loss/logits": 0.3432563245296478, "step": 494 }, { "epoch": 0.031, "grad_norm": 4.71875, "grad_norm_var": 0.575634765625, "learning_rate": 0.0001, "loss": 10.3094, "loss/crossentropy": 2.44759202003479, "loss/hidden": 3.9765625, "loss/jsd": 0.0, "loss/logits": 0.3646901249885559, "step": 496 }, { "epoch": 0.031125, "grad_norm": 5.71875, "grad_norm_var": 0.5779296875, "learning_rate": 0.0001, "loss": 10.6323, "loss/crossentropy": 2.4714183807373047, "loss/hidden": 4.140625, "loss/jsd": 0.0, "loss/logits": 0.4395069479942322, "step": 498 }, { "epoch": 0.03125, "grad_norm": 4.875, "grad_norm_var": 0.5239420572916667, "learning_rate": 0.0001, "loss": 10.5234, "loss/crossentropy": 2.425844192504883, "loss/hidden": 4.078125, "loss/jsd": 0.0, "loss/logits": 0.3750711977481842, "step": 500 }, { "epoch": 0.031375, "grad_norm": 4.8125, "grad_norm_var": 0.5263631184895833, "learning_rate": 0.0001, "loss": 10.0027, "loss/crossentropy": 2.187627673149109, "loss/hidden": 3.9375, "loss/jsd": 0.0, "loss/logits": 0.3448432832956314, "step": 502 }, { "epoch": 0.0315, "grad_norm": 4.75, "grad_norm_var": 0.08319905598958334, "learning_rate": 0.0001, "loss": 10.3973, "loss/crossentropy": 2.592836856842041, "loss/hidden": 4.078125, "loss/jsd": 0.0, "loss/logits": 0.3808598816394806, "step": 504 }, { "epoch": 0.031625, "grad_norm": 4.78125, "grad_norm_var": 0.086181640625, "learning_rate": 0.0001, "loss": 10.3901, "loss/crossentropy": 2.621356964111328, "loss/hidden": 4.140625, "loss/jsd": 0.0, "loss/logits": 0.37457969784736633, "step": 506 }, { "epoch": 0.03175, "grad_norm": 13.375, "grad_norm_var": 4.523758951822916, "learning_rate": 0.0001, "loss": 10.7639, "loss/crossentropy": 2.4040629863739014, "loss/hidden": 4.0078125, "loss/jsd": 0.0, "loss/logits": 0.40050315856933594, "step": 508 }, { "epoch": 0.031875, "grad_norm": 6.40625, "grad_norm_var": 4.598368326822917, "learning_rate": 0.0001, "loss": 10.5925, "loss/crossentropy": 2.6625452041625977, "loss/hidden": 4.109375, "loss/jsd": 0.0, "loss/logits": 0.42064009606838226, "step": 510 }, { "epoch": 0.032, "grad_norm": 4.6875, "grad_norm_var": 4.583463541666666, "learning_rate": 0.0001, "loss": 10.6113, "loss/crossentropy": 2.3583052158355713, "loss/hidden": 4.09375, "loss/jsd": 0.0, "loss/logits": 0.40322598814964294, "step": 512 }, { "epoch": 0.032125, "grad_norm": 5.25, "grad_norm_var": 4.567867024739583, "learning_rate": 0.0001, "loss": 10.4471, "loss/crossentropy": 2.333785891532898, "loss/hidden": 4.0078125, "loss/jsd": 0.0, "loss/logits": 0.37443122267723083, "step": 514 }, { "epoch": 0.03225, "grad_norm": 4.65625, "grad_norm_var": 4.6419921875, "learning_rate": 0.0001, "loss": 10.1493, "loss/crossentropy": 2.500606060028076, "loss/hidden": 4.015625, "loss/jsd": 0.0, "loss/logits": 0.3583361357450485, "step": 516 }, { "epoch": 0.032375, "grad_norm": 4.78125, "grad_norm_var": 4.674247233072917, "learning_rate": 0.0001, "loss": 10.1221, "loss/crossentropy": 2.201894164085388, "loss/hidden": 4.046875, "loss/jsd": 0.0, "loss/logits": 0.35936446487903595, "step": 518 }, { "epoch": 0.0325, "grad_norm": 4.625, "grad_norm_var": 4.658784993489584, "learning_rate": 0.0001, "loss": 10.7879, "loss/crossentropy": 2.422861099243164, "loss/hidden": 4.09375, "loss/jsd": 0.0, "loss/logits": 0.4262392073869705, "step": 520 }, { "epoch": 0.032625, "grad_norm": 6.15625, "grad_norm_var": 4.619755045572917, "learning_rate": 0.0001, "loss": 10.5335, "loss/crossentropy": 2.609155774116516, "loss/hidden": 4.09375, "loss/jsd": 0.0, "loss/logits": 0.40598247945308685, "step": 522 }, { "epoch": 0.03275, "grad_norm": 4.9375, "grad_norm_var": 0.5051920572916667, "learning_rate": 0.0001, "loss": 10.2853, "loss/crossentropy": 2.4510494470596313, "loss/hidden": 4.171875, "loss/jsd": 0.0, "loss/logits": 0.4115123152732849, "step": 524 }, { "epoch": 0.032875, "grad_norm": 4.65625, "grad_norm_var": 0.26568603515625, "learning_rate": 0.0001, "loss": 10.7524, "loss/crossentropy": 2.604946732521057, "loss/hidden": 4.015625, "loss/jsd": 0.0, "loss/logits": 0.41282832622528076, "step": 526 }, { "epoch": 0.033, "grad_norm": 10.4375, "grad_norm_var": 2.0192545572916667, "learning_rate": 0.0001, "loss": 10.4002, "loss/crossentropy": 2.611866593360901, "loss/hidden": 4.015625, "loss/jsd": 0.0, "loss/logits": 0.37592028081417084, "step": 528 }, { "epoch": 0.033125, "grad_norm": 5.0, "grad_norm_var": 2.0321451822916665, "learning_rate": 0.0001, "loss": 10.4279, "loss/crossentropy": 2.599483370780945, "loss/hidden": 4.140625, "loss/jsd": 0.0, "loss/logits": 0.37497493624687195, "step": 530 }, { "epoch": 0.03325, "grad_norm": 4.6875, "grad_norm_var": 2.026497395833333, "learning_rate": 0.0001, "loss": 10.1858, "loss/crossentropy": 2.2957273721694946, "loss/hidden": 4.015625, "loss/jsd": 0.0, "loss/logits": 0.36572718620300293, "step": 532 }, { "epoch": 0.033375, "grad_norm": 4.8125, "grad_norm_var": 1.99713134765625, "learning_rate": 0.0001, "loss": 10.4544, "loss/crossentropy": 2.44324791431427, "loss/hidden": 4.171875, "loss/jsd": 0.0, "loss/logits": 0.4093717336654663, "step": 534 }, { "epoch": 0.0335, "grad_norm": 4.84375, "grad_norm_var": 1.9977701822916667, "learning_rate": 0.0001, "loss": 10.2236, "loss/crossentropy": 2.415123224258423, "loss/hidden": 4.078125, "loss/jsd": 0.0, "loss/logits": 0.36705660820007324, "step": 536 }, { "epoch": 0.033625, "grad_norm": 4.875, "grad_norm_var": 1.9493326822916666, "learning_rate": 0.0001, "loss": 10.5376, "loss/crossentropy": 2.466187596321106, "loss/hidden": 4.09375, "loss/jsd": 0.0, "loss/logits": 0.40770016610622406, "step": 538 }, { "epoch": 0.03375, "grad_norm": 6.03125, "grad_norm_var": 1.965478515625, "learning_rate": 0.0001, "loss": 10.4253, "loss/crossentropy": 2.379727602005005, "loss/hidden": 4.015625, "loss/jsd": 0.0, "loss/logits": 0.3828308582305908, "step": 540 }, { "epoch": 0.033875, "grad_norm": 5.5625, "grad_norm_var": 1.9148274739583333, "learning_rate": 0.0001, "loss": 10.4729, "loss/crossentropy": 2.502163052558899, "loss/hidden": 4.0625, "loss/jsd": 0.0, "loss/logits": 0.43099866807460785, "step": 542 }, { "epoch": 0.034, "grad_norm": 4.9375, "grad_norm_var": 0.15963541666666667, "learning_rate": 0.0001, "loss": 10.353, "loss/crossentropy": 2.501845955848694, "loss/hidden": 3.9296875, "loss/jsd": 0.0, "loss/logits": 0.3675364851951599, "step": 544 }, { "epoch": 0.034125, "grad_norm": 5.40625, "grad_norm_var": 0.18144124348958332, "learning_rate": 0.0001, "loss": 10.2907, "loss/crossentropy": 2.376683473587036, "loss/hidden": 4.109375, "loss/jsd": 0.0, "loss/logits": 0.41435733437538147, "step": 546 }, { "epoch": 0.03425, "grad_norm": 5.34375, "grad_norm_var": 0.17939046223958333, "learning_rate": 0.0001, "loss": 10.2278, "loss/crossentropy": 2.230885148048401, "loss/hidden": 4.109375, "loss/jsd": 0.0, "loss/logits": 0.3870948702096939, "step": 548 }, { "epoch": 0.034375, "grad_norm": 4.96875, "grad_norm_var": 0.18331705729166667, "learning_rate": 0.0001, "loss": 10.0963, "loss/crossentropy": 2.3854016065597534, "loss/hidden": 3.9453125, "loss/jsd": 0.0, "loss/logits": 0.3725956082344055, "step": 550 }, { "epoch": 0.0345, "grad_norm": 4.8125, "grad_norm_var": 0.18017171223958334, "learning_rate": 0.0001, "loss": 10.4064, "loss/crossentropy": 2.30772066116333, "loss/hidden": 4.0625, "loss/jsd": 0.0, "loss/logits": 0.41719433665275574, "step": 552 }, { "epoch": 0.034625, "grad_norm": 4.9375, "grad_norm_var": 0.19784749348958333, "learning_rate": 0.0001, "loss": 9.9904, "loss/crossentropy": 2.4518308639526367, "loss/hidden": 3.9140625, "loss/jsd": 0.0, "loss/logits": 0.367490217089653, "step": 554 }, { "epoch": 0.03475, "grad_norm": 4.65625, "grad_norm_var": 0.145166015625, "learning_rate": 0.0001, "loss": 10.186, "loss/crossentropy": 2.5720479488372803, "loss/hidden": 3.96875, "loss/jsd": 0.0, "loss/logits": 0.36091138422489166, "step": 556 }, { "epoch": 0.034875, "grad_norm": 4.65625, "grad_norm_var": 0.09309488932291667, "learning_rate": 0.0001, "loss": 10.0815, "loss/crossentropy": 2.4808801412582397, "loss/hidden": 4.03125, "loss/jsd": 0.0, "loss/logits": 0.3723009526729584, "step": 558 }, { "epoch": 0.035, "grad_norm": 4.71875, "grad_norm_var": 0.09099934895833334, "learning_rate": 0.0001, "loss": 10.0476, "loss/crossentropy": 2.4010642766952515, "loss/hidden": 3.9453125, "loss/jsd": 0.0, "loss/logits": 0.3671903610229492, "step": 560 }, { "epoch": 0.035125, "grad_norm": 5.375, "grad_norm_var": 0.08674723307291667, "learning_rate": 0.0001, "loss": 10.4371, "loss/crossentropy": 2.4921680688858032, "loss/hidden": 4.1015625, "loss/jsd": 0.0, "loss/logits": 0.4616352915763855, "step": 562 }, { "epoch": 0.03525, "grad_norm": 4.6875, "grad_norm_var": 0.049540201822916664, "learning_rate": 0.0001, "loss": 10.2687, "loss/crossentropy": 2.5980935096740723, "loss/hidden": 4.03125, "loss/jsd": 0.0, "loss/logits": 0.4135672152042389, "step": 564 }, { "epoch": 0.035375, "grad_norm": 5.0625, "grad_norm_var": 0.053446451822916664, "learning_rate": 0.0001, "loss": 10.2881, "loss/crossentropy": 2.515305280685425, "loss/hidden": 3.875, "loss/jsd": 0.0, "loss/logits": 0.3722418546676636, "step": 566 }, { "epoch": 0.0355, "grad_norm": 4.65625, "grad_norm_var": 0.06464436848958334, "learning_rate": 0.0001, "loss": 10.2975, "loss/crossentropy": 2.559085965156555, "loss/hidden": 4.125, "loss/jsd": 0.0, "loss/logits": 0.42906875908374786, "step": 568 }, { "epoch": 0.035625, "grad_norm": 4.3125, "grad_norm_var": 0.08541666666666667, "learning_rate": 0.0001, "loss": 9.8115, "loss/crossentropy": 2.2619433403015137, "loss/hidden": 3.984375, "loss/jsd": 0.0, "loss/logits": 0.3420299142599106, "step": 570 }, { "epoch": 0.03575, "grad_norm": 4.5, "grad_norm_var": 0.14739583333333334, "learning_rate": 0.0001, "loss": 10.4233, "loss/crossentropy": 2.5262789726257324, "loss/hidden": 3.953125, "loss/jsd": 0.0, "loss/logits": 0.39023733139038086, "step": 572 }, { "epoch": 0.035875, "grad_norm": 4.90625, "grad_norm_var": 0.30859375, "learning_rate": 0.0001, "loss": 10.3246, "loss/crossentropy": 2.567444682121277, "loss/hidden": 3.984375, "loss/jsd": 0.0, "loss/logits": 0.3690713047981262, "step": 574 }, { "epoch": 0.036, "grad_norm": 4.96875, "grad_norm_var": 0.30383707682291666, "learning_rate": 0.0001, "loss": 10.355, "loss/crossentropy": 2.6742849349975586, "loss/hidden": 4.109375, "loss/jsd": 0.0, "loss/logits": 0.37552310526371, "step": 576 }, { "epoch": 0.036125, "grad_norm": 4.40625, "grad_norm_var": 0.3316243489583333, "learning_rate": 0.0001, "loss": 10.1256, "loss/crossentropy": 2.3429067134857178, "loss/hidden": 4.015625, "loss/jsd": 0.0, "loss/logits": 0.37690627574920654, "step": 578 }, { "epoch": 0.03625, "grad_norm": 5.3125, "grad_norm_var": 0.47980143229166666, "learning_rate": 0.0001, "loss": 10.6361, "loss/crossentropy": 2.4363961219787598, "loss/hidden": 4.1875, "loss/jsd": 0.0, "loss/logits": 0.4327695965766907, "step": 580 }, { "epoch": 0.036375, "grad_norm": 4.46875, "grad_norm_var": 0.49347330729166666, "learning_rate": 0.0001, "loss": 10.2303, "loss/crossentropy": 2.6203149557113647, "loss/hidden": 4.0625, "loss/jsd": 0.0, "loss/logits": 0.3839530050754547, "step": 582 }, { "epoch": 0.0365, "grad_norm": 4.6875, "grad_norm_var": 0.503125, "learning_rate": 0.0001, "loss": 10.4335, "loss/crossentropy": 2.705429792404175, "loss/hidden": 3.953125, "loss/jsd": 0.0, "loss/logits": 0.3913826197385788, "step": 584 }, { "epoch": 0.036625, "grad_norm": 5.59375, "grad_norm_var": 0.46151936848958336, "learning_rate": 0.0001, "loss": 10.1133, "loss/crossentropy": 2.2338947057724, "loss/hidden": 3.921875, "loss/jsd": 0.0, "loss/logits": 0.3503521531820297, "step": 586 }, { "epoch": 0.03675, "grad_norm": 4.3125, "grad_norm_var": 0.4554036458333333, "learning_rate": 0.0001, "loss": 10.2664, "loss/crossentropy": 2.4711296558380127, "loss/hidden": 4.03125, "loss/jsd": 0.0, "loss/logits": 0.39591944217681885, "step": 588 }, { "epoch": 0.036875, "grad_norm": 4.40625, "grad_norm_var": 0.38765869140625, "learning_rate": 0.0001, "loss": 10.2556, "loss/crossentropy": 2.4357420206069946, "loss/hidden": 3.8515625, "loss/jsd": 0.0, "loss/logits": 0.38228775560855865, "step": 590 }, { "epoch": 0.037, "grad_norm": 5.0625, "grad_norm_var": 0.3698201497395833, "learning_rate": 0.0001, "loss": 10.1555, "loss/crossentropy": 2.263747811317444, "loss/hidden": 4.0, "loss/jsd": 0.0, "loss/logits": 0.40818026661872864, "step": 592 }, { "epoch": 0.037125, "grad_norm": 4.5, "grad_norm_var": 0.3490193684895833, "learning_rate": 0.0001, "loss": 9.9666, "loss/crossentropy": 2.54870069026947, "loss/hidden": 3.9453125, "loss/jsd": 0.0, "loss/logits": 0.37873171269893646, "step": 594 }, { "epoch": 0.03725, "grad_norm": 5.96875, "grad_norm_var": 0.25271809895833336, "learning_rate": 0.0001, "loss": 10.1816, "loss/crossentropy": 2.5280078649520874, "loss/hidden": 3.9375, "loss/jsd": 0.0, "loss/logits": 0.3799082934856415, "step": 596 }, { "epoch": 0.037375, "grad_norm": 4.53125, "grad_norm_var": 0.266259765625, "learning_rate": 0.0001, "loss": 9.871, "loss/crossentropy": 2.1612548232078552, "loss/hidden": 3.921875, "loss/jsd": 0.0, "loss/logits": 0.3508923500776291, "step": 598 }, { "epoch": 0.0375, "grad_norm": 5.4375, "grad_norm_var": 0.28664957682291664, "learning_rate": 0.0001, "loss": 10.0122, "loss/crossentropy": 2.33645498752594, "loss/hidden": 4.046875, "loss/jsd": 0.0, "loss/logits": 0.34663376212120056, "step": 600 }, { "epoch": 0.037625, "grad_norm": 4.15625, "grad_norm_var": 0.25558268229166664, "learning_rate": 0.0001, "loss": 9.8853, "loss/crossentropy": 2.263962745666504, "loss/hidden": 3.890625, "loss/jsd": 0.0, "loss/logits": 0.3584403544664383, "step": 602 }, { "epoch": 0.03775, "grad_norm": 4.9375, "grad_norm_var": 0.243603515625, "learning_rate": 0.0001, "loss": 9.9895, "loss/crossentropy": 2.4892383813858032, "loss/hidden": 3.9375, "loss/jsd": 0.0, "loss/logits": 0.3693755269050598, "step": 604 }, { "epoch": 0.037875, "grad_norm": 4.5, "grad_norm_var": 0.218603515625, "learning_rate": 0.0001, "loss": 9.9086, "loss/crossentropy": 2.5046552419662476, "loss/hidden": 3.984375, "loss/jsd": 0.0, "loss/logits": 0.3778345286846161, "step": 606 }, { "epoch": 0.038, "grad_norm": 4.46875, "grad_norm_var": 0.21441650390625, "learning_rate": 0.0001, "loss": 9.9579, "loss/crossentropy": 2.383268356323242, "loss/hidden": 3.90625, "loss/jsd": 0.0, "loss/logits": 0.35156671702861786, "step": 608 }, { "epoch": 0.038125, "grad_norm": 4.59375, "grad_norm_var": 0.21565348307291668, "learning_rate": 0.0001, "loss": 10.1245, "loss/crossentropy": 2.6101828813552856, "loss/hidden": 3.96875, "loss/jsd": 0.0, "loss/logits": 0.36821986734867096, "step": 610 }, { "epoch": 0.03825, "grad_norm": 4.5625, "grad_norm_var": 0.10784098307291666, "learning_rate": 0.0001, "loss": 9.6363, "loss/crossentropy": 2.154883623123169, "loss/hidden": 3.84375, "loss/jsd": 0.0, "loss/logits": 0.351752445101738, "step": 612 }, { "epoch": 0.038375, "grad_norm": 4.65625, "grad_norm_var": 0.10461832682291666, "learning_rate": 0.0001, "loss": 10.1564, "loss/crossentropy": 2.7461551427841187, "loss/hidden": 4.078125, "loss/jsd": 0.0, "loss/logits": 0.40302354097366333, "step": 614 }, { "epoch": 0.0385, "grad_norm": 4.65625, "grad_norm_var": 0.04967447916666667, "learning_rate": 0.0001, "loss": 9.964, "loss/crossentropy": 2.215000867843628, "loss/hidden": 4.046875, "loss/jsd": 0.0, "loss/logits": 0.37297672033309937, "step": 616 }, { "epoch": 0.038625, "grad_norm": 5.03125, "grad_norm_var": 0.0564453125, "learning_rate": 0.0001, "loss": 9.8702, "loss/crossentropy": 2.5331451892852783, "loss/hidden": 3.90625, "loss/jsd": 0.0, "loss/logits": 0.3476633280515671, "step": 618 }, { "epoch": 0.03875, "grad_norm": 4.6875, "grad_norm_var": 0.07550455729166666, "learning_rate": 0.0001, "loss": 10.4216, "loss/crossentropy": 2.4520362615585327, "loss/hidden": 3.96875, "loss/jsd": 0.0, "loss/logits": 0.38907913863658905, "step": 620 }, { "epoch": 0.038875, "grad_norm": 4.1875, "grad_norm_var": 0.0876953125, "learning_rate": 0.0001, "loss": 9.9443, "loss/crossentropy": 2.1309529542922974, "loss/hidden": 3.8984375, "loss/jsd": 0.0, "loss/logits": 0.3472695052623749, "step": 622 }, { "epoch": 0.039, "grad_norm": 5.0625, "grad_norm_var": 0.10517171223958334, "learning_rate": 0.0001, "loss": 10.2231, "loss/crossentropy": 2.5359339714050293, "loss/hidden": 3.9765625, "loss/jsd": 0.0, "loss/logits": 0.3966711014509201, "step": 624 }, { "epoch": 0.039125, "grad_norm": 5.34375, "grad_norm_var": 0.13763020833333334, "learning_rate": 0.0001, "loss": 10.0318, "loss/crossentropy": 2.4495298862457275, "loss/hidden": 3.9765625, "loss/jsd": 0.0, "loss/logits": 0.3523852229118347, "step": 626 }, { "epoch": 0.03925, "grad_norm": 4.6875, "grad_norm_var": 0.12646077473958334, "learning_rate": 0.0001, "loss": 10.1339, "loss/crossentropy": 2.4048619270324707, "loss/hidden": 3.953125, "loss/jsd": 0.0, "loss/logits": 0.37286487221717834, "step": 628 }, { "epoch": 0.039375, "grad_norm": 4.0625, "grad_norm_var": 0.15050455729166667, "learning_rate": 0.0001, "loss": 9.5416, "loss/crossentropy": 2.2729824781417847, "loss/hidden": 3.8515625, "loss/jsd": 0.0, "loss/logits": 0.35299360752105713, "step": 630 }, { "epoch": 0.0395, "grad_norm": 4.625, "grad_norm_var": 0.14390869140625, "learning_rate": 0.0001, "loss": 9.6436, "loss/crossentropy": 2.464027762413025, "loss/hidden": 3.8515625, "loss/jsd": 0.0, "loss/logits": 0.3433645963668823, "step": 632 }, { "epoch": 0.039625, "grad_norm": 4.625, "grad_norm_var": 0.14296468098958334, "learning_rate": 0.0001, "loss": 10.0132, "loss/crossentropy": 2.260706663131714, "loss/hidden": 3.9375, "loss/jsd": 0.0, "loss/logits": 0.358672633767128, "step": 634 }, { "epoch": 0.03975, "grad_norm": 5.34375, "grad_norm_var": 0.16431884765625, "learning_rate": 0.0001, "loss": 10.0665, "loss/crossentropy": 2.443928599357605, "loss/hidden": 3.9140625, "loss/jsd": 0.0, "loss/logits": 0.34917180240154266, "step": 636 }, { "epoch": 0.039875, "grad_norm": 4.15625, "grad_norm_var": 0.16360270182291667, "learning_rate": 0.0001, "loss": 9.8865, "loss/crossentropy": 2.355438470840454, "loss/hidden": 3.921875, "loss/jsd": 0.0, "loss/logits": 0.3681969791650772, "step": 638 }, { "epoch": 0.04, "grad_norm": 5.15625, "grad_norm_var": 0.17375895182291667, "learning_rate": 0.0001, "loss": 9.988, "loss/crossentropy": 2.403064250946045, "loss/hidden": 3.890625, "loss/jsd": 0.0, "loss/logits": 0.3809404671192169, "step": 640 }, { "epoch": 0.040125, "grad_norm": 5.25, "grad_norm_var": 0.16539306640625, "learning_rate": 0.0001, "loss": 9.8894, "loss/crossentropy": 2.277324080467224, "loss/hidden": 3.7890625, "loss/jsd": 0.0, "loss/logits": 0.33590181171894073, "step": 642 }, { "epoch": 0.04025, "grad_norm": 4.75, "grad_norm_var": 0.16705729166666666, "learning_rate": 0.0001, "loss": 10.0104, "loss/crossentropy": 2.6433370113372803, "loss/hidden": 3.9765625, "loss/jsd": 0.0, "loss/logits": 0.3841419368982315, "step": 644 }, { "epoch": 0.040375, "grad_norm": 4.375, "grad_norm_var": 0.148828125, "learning_rate": 0.0001, "loss": 9.9156, "loss/crossentropy": 2.5485310554504395, "loss/hidden": 3.796875, "loss/jsd": 0.0, "loss/logits": 0.34872615337371826, "step": 646 }, { "epoch": 0.0405, "grad_norm": 4.0, "grad_norm_var": 0.17229410807291667, "learning_rate": 0.0001, "loss": 9.6353, "loss/crossentropy": 2.5063722133636475, "loss/hidden": 3.859375, "loss/jsd": 0.0, "loss/logits": 0.36940453946590424, "step": 648 }, { "epoch": 0.040625, "grad_norm": 4.65625, "grad_norm_var": 0.18136393229166667, "learning_rate": 0.0001, "loss": 9.9078, "loss/crossentropy": 2.39488685131073, "loss/hidden": 3.875, "loss/jsd": 0.0, "loss/logits": 0.3586680442094803, "step": 650 }, { "epoch": 0.04075, "grad_norm": 4.59375, "grad_norm_var": 0.13019205729166666, "learning_rate": 0.0001, "loss": 9.9067, "loss/crossentropy": 2.377121686935425, "loss/hidden": 3.8359375, "loss/jsd": 0.0, "loss/logits": 0.36309733986854553, "step": 652 }, { "epoch": 0.040875, "grad_norm": 4.28125, "grad_norm_var": 0.12916666666666668, "learning_rate": 0.0001, "loss": 10.099, "loss/crossentropy": 2.5704420804977417, "loss/hidden": 3.8984375, "loss/jsd": 0.0, "loss/logits": 0.3873720318078995, "step": 654 }, { "epoch": 0.041, "grad_norm": 5.90625, "grad_norm_var": 0.21584879557291667, "learning_rate": 0.0001, "loss": 9.9181, "loss/crossentropy": 2.2711371183395386, "loss/hidden": 4.0078125, "loss/jsd": 0.0, "loss/logits": 0.3648662865161896, "step": 656 }, { "epoch": 0.041125, "grad_norm": 5.25, "grad_norm_var": 0.21679280598958334, "learning_rate": 0.0001, "loss": 10.0834, "loss/crossentropy": 2.5198888778686523, "loss/hidden": 3.9375, "loss/jsd": 0.0, "loss/logits": 0.3765381723642349, "step": 658 }, { "epoch": 0.04125, "grad_norm": 4.40625, "grad_norm_var": 0.21443684895833334, "learning_rate": 0.0001, "loss": 9.7288, "loss/crossentropy": 2.2896225452423096, "loss/hidden": 3.7890625, "loss/jsd": 0.0, "loss/logits": 0.35969309508800507, "step": 660 }, { "epoch": 0.041375, "grad_norm": 4.5625, "grad_norm_var": 0.21106770833333333, "learning_rate": 0.0001, "loss": 10.0562, "loss/crossentropy": 2.4178755283355713, "loss/hidden": 3.8828125, "loss/jsd": 0.0, "loss/logits": 0.3961552679538727, "step": 662 }, { "epoch": 0.0415, "grad_norm": 4.71875, "grad_norm_var": 0.19529622395833332, "learning_rate": 0.0001, "loss": 9.9744, "loss/crossentropy": 2.547677516937256, "loss/hidden": 3.828125, "loss/jsd": 0.0, "loss/logits": 0.3570811301469803, "step": 664 }, { "epoch": 0.041625, "grad_norm": 4.78125, "grad_norm_var": 0.17502848307291666, "learning_rate": 0.0001, "loss": 9.8543, "loss/crossentropy": 2.4284908771514893, "loss/hidden": 3.84375, "loss/jsd": 0.0, "loss/logits": 0.3434506356716156, "step": 666 }, { "epoch": 0.04175, "grad_norm": 4.1875, "grad_norm_var": 0.19075520833333334, "learning_rate": 0.0001, "loss": 9.9655, "loss/crossentropy": 2.108651876449585, "loss/hidden": 3.8359375, "loss/jsd": 0.0, "loss/logits": 0.36953288316726685, "step": 668 }, { "epoch": 0.041875, "grad_norm": 4.71875, "grad_norm_var": 0.17646077473958333, "learning_rate": 0.0001, "loss": 10.0564, "loss/crossentropy": 2.3314043283462524, "loss/hidden": 3.8359375, "loss/jsd": 0.0, "loss/logits": 0.3440769463777542, "step": 670 }, { "epoch": 0.042, "grad_norm": 4.65625, "grad_norm_var": 0.06848551432291666, "learning_rate": 0.0001, "loss": 9.8599, "loss/crossentropy": 2.3711284399032593, "loss/hidden": 3.84375, "loss/jsd": 0.0, "loss/logits": 0.34241366386413574, "step": 672 }, { "epoch": 0.042125, "grad_norm": 5.0, "grad_norm_var": 0.05006103515625, "learning_rate": 0.0001, "loss": 9.9082, "loss/crossentropy": 2.3005160093307495, "loss/hidden": 3.8125, "loss/jsd": 0.0, "loss/logits": 0.3360006958246231, "step": 674 }, { "epoch": 0.04225, "grad_norm": 4.09375, "grad_norm_var": 0.068359375, "learning_rate": 0.0001, "loss": 10.0805, "loss/crossentropy": 2.590296149253845, "loss/hidden": 3.890625, "loss/jsd": 0.0, "loss/logits": 0.3790188133716583, "step": 676 }, { "epoch": 0.042375, "grad_norm": 4.3125, "grad_norm_var": 0.080322265625, "learning_rate": 0.0001, "loss": 9.8713, "loss/crossentropy": 2.582974672317505, "loss/hidden": 3.8359375, "loss/jsd": 0.0, "loss/logits": 0.3644176125526428, "step": 678 }, { "epoch": 0.0425, "grad_norm": 4.0625, "grad_norm_var": 0.09000244140625, "learning_rate": 0.0001, "loss": 10.0148, "loss/crossentropy": 2.6205986738204956, "loss/hidden": 3.8359375, "loss/jsd": 0.0, "loss/logits": 0.3867751955986023, "step": 680 }, { "epoch": 0.042625, "grad_norm": 4.25, "grad_norm_var": 0.08826497395833334, "learning_rate": 0.0001, "loss": 9.6616, "loss/crossentropy": 2.3413681983947754, "loss/hidden": 3.75, "loss/jsd": 0.0, "loss/logits": 0.35291582345962524, "step": 682 }, { "epoch": 0.04275, "grad_norm": 4.78125, "grad_norm_var": 0.09547119140625, "learning_rate": 0.0001, "loss": 9.7887, "loss/crossentropy": 2.524027109146118, "loss/hidden": 3.796875, "loss/jsd": 0.0, "loss/logits": 0.3748561441898346, "step": 684 }, { "epoch": 0.042875, "grad_norm": 4.25, "grad_norm_var": 0.09693603515625, "learning_rate": 0.0001, "loss": 9.7663, "loss/crossentropy": 2.586169123649597, "loss/hidden": 3.8203125, "loss/jsd": 0.0, "loss/logits": 0.3734195679426193, "step": 686 }, { "epoch": 0.043, "grad_norm": 4.25, "grad_norm_var": 0.098291015625, "learning_rate": 0.0001, "loss": 10.1415, "loss/crossentropy": 2.575216293334961, "loss/hidden": 3.921875, "loss/jsd": 0.0, "loss/logits": 0.37598639726638794, "step": 688 }, { "epoch": 0.043125, "grad_norm": 4.4375, "grad_norm_var": 0.1171875, "learning_rate": 0.0001, "loss": 10.0767, "loss/crossentropy": 2.782447099685669, "loss/hidden": 3.9765625, "loss/jsd": 0.0, "loss/logits": 0.39427442848682404, "step": 690 }, { "epoch": 0.04325, "grad_norm": 4.59375, "grad_norm_var": 0.10546468098958334, "learning_rate": 0.0001, "loss": 10.1002, "loss/crossentropy": 2.4408079385757446, "loss/hidden": 3.9375, "loss/jsd": 0.0, "loss/logits": 0.39183132350444794, "step": 692 }, { "epoch": 0.043375, "grad_norm": 5.09375, "grad_norm_var": 0.11373291015625, "learning_rate": 0.0001, "loss": 9.9914, "loss/crossentropy": 2.428073763847351, "loss/hidden": 3.8828125, "loss/jsd": 0.0, "loss/logits": 0.3673141598701477, "step": 694 }, { "epoch": 0.0435, "grad_norm": 4.1875, "grad_norm_var": 0.11808268229166667, "learning_rate": 0.0001, "loss": 9.8208, "loss/crossentropy": 2.6366835832595825, "loss/hidden": 3.78125, "loss/jsd": 0.0, "loss/logits": 0.36033181846141815, "step": 696 }, { "epoch": 0.043625, "grad_norm": 5.40625, "grad_norm_var": 0.14921468098958332, "learning_rate": 0.0001, "loss": 10.0662, "loss/crossentropy": 2.119105100631714, "loss/hidden": 3.984375, "loss/jsd": 0.0, "loss/logits": 0.3732317090034485, "step": 698 }, { "epoch": 0.04375, "grad_norm": 4.34375, "grad_norm_var": 0.14996337890625, "learning_rate": 0.0001, "loss": 10.0521, "loss/crossentropy": 2.4997419118881226, "loss/hidden": 3.796875, "loss/jsd": 0.0, "loss/logits": 0.3387012630701065, "step": 700 }, { "epoch": 0.043875, "grad_norm": 3.921875, "grad_norm_var": 0.1813873291015625, "learning_rate": 0.0001, "loss": 9.7343, "loss/crossentropy": 2.2400662899017334, "loss/hidden": 3.8671875, "loss/jsd": 0.0, "loss/logits": 0.3351885676383972, "step": 702 }, { "epoch": 0.044, "grad_norm": 5.21875, "grad_norm_var": 0.21665751139322917, "learning_rate": 0.0001, "loss": 9.8692, "loss/crossentropy": 2.2805423736572266, "loss/hidden": 3.7734375, "loss/jsd": 0.0, "loss/logits": 0.33280137181282043, "step": 704 }, { "epoch": 0.044125, "grad_norm": 4.8125, "grad_norm_var": 0.24158426920572917, "learning_rate": 0.0001, "loss": 10.2821, "loss/crossentropy": 2.5463110208511353, "loss/hidden": 4.0390625, "loss/jsd": 0.0, "loss/logits": 0.4047670066356659, "step": 706 }, { "epoch": 0.04425, "grad_norm": 4.28125, "grad_norm_var": 0.25898335774739584, "learning_rate": 0.0001, "loss": 9.893, "loss/crossentropy": 2.5133782625198364, "loss/hidden": 3.9140625, "loss/jsd": 0.0, "loss/logits": 0.3492252826690674, "step": 708 }, { "epoch": 0.044375, "grad_norm": 4.40625, "grad_norm_var": 0.2576080322265625, "learning_rate": 0.0001, "loss": 9.7662, "loss/crossentropy": 2.58932888507843, "loss/hidden": 3.890625, "loss/jsd": 0.0, "loss/logits": 0.36606909334659576, "step": 710 }, { "epoch": 0.0445, "grad_norm": 4.21875, "grad_norm_var": 0.24614156087239583, "learning_rate": 0.0001, "loss": 9.8397, "loss/crossentropy": 2.5551047325134277, "loss/hidden": 3.84375, "loss/jsd": 0.0, "loss/logits": 0.37199999392032623, "step": 712 }, { "epoch": 0.044625, "grad_norm": 4.875, "grad_norm_var": 0.21269429524739583, "learning_rate": 0.0001, "loss": 9.6663, "loss/crossentropy": 2.2038984298706055, "loss/hidden": 3.8828125, "loss/jsd": 0.0, "loss/logits": 0.355471596121788, "step": 714 }, { "epoch": 0.04475, "grad_norm": 4.1875, "grad_norm_var": 0.22046610514322917, "learning_rate": 0.0001, "loss": 9.8526, "loss/crossentropy": 2.4986603260040283, "loss/hidden": 3.890625, "loss/jsd": 0.0, "loss/logits": 0.3514595031738281, "step": 716 }, { "epoch": 0.044875, "grad_norm": 4.5, "grad_norm_var": 0.17919514973958334, "learning_rate": 0.0001, "loss": 9.9912, "loss/crossentropy": 2.67462694644928, "loss/hidden": 3.8125, "loss/jsd": 0.0, "loss/logits": 0.36870990693569183, "step": 718 }, { "epoch": 0.045, "grad_norm": 4.09375, "grad_norm_var": 0.14490559895833333, "learning_rate": 0.0001, "loss": 10.125, "loss/crossentropy": 2.4971606731414795, "loss/hidden": 3.8203125, "loss/jsd": 0.0, "loss/logits": 0.39218752086162567, "step": 720 }, { "epoch": 0.045125, "grad_norm": 4.40625, "grad_norm_var": 0.060791015625, "learning_rate": 0.0001, "loss": 9.7016, "loss/crossentropy": 2.4049805402755737, "loss/hidden": 3.8515625, "loss/jsd": 0.0, "loss/logits": 0.32926052808761597, "step": 722 }, { "epoch": 0.04525, "grad_norm": 4.28125, "grad_norm_var": 0.055859375, "learning_rate": 0.0001, "loss": 9.8177, "loss/crossentropy": 2.590659022331238, "loss/hidden": 3.8671875, "loss/jsd": 0.0, "loss/logits": 0.3945636451244354, "step": 724 }, { "epoch": 0.045375, "grad_norm": 4.96875, "grad_norm_var": 0.07823893229166666, "learning_rate": 0.0001, "loss": 9.6899, "loss/crossentropy": 2.2857325077056885, "loss/hidden": 3.8046875, "loss/jsd": 0.0, "loss/logits": 0.35235145688056946, "step": 726 }, { "epoch": 0.0455, "grad_norm": 4.15625, "grad_norm_var": 0.08201497395833333, "learning_rate": 0.0001, "loss": 9.6263, "loss/crossentropy": 2.201639175415039, "loss/hidden": 3.7734375, "loss/jsd": 0.0, "loss/logits": 0.33390986919403076, "step": 728 }, { "epoch": 0.045625, "grad_norm": 5.03125, "grad_norm_var": 0.10592447916666667, "learning_rate": 0.0001, "loss": 9.9654, "loss/crossentropy": 2.546342372894287, "loss/hidden": 3.84375, "loss/jsd": 0.0, "loss/logits": 0.3447662442922592, "step": 730 }, { "epoch": 0.04575, "grad_norm": 4.3125, "grad_norm_var": 0.11259358723958333, "learning_rate": 0.0001, "loss": 9.6529, "loss/crossentropy": 2.465666890144348, "loss/hidden": 3.84375, "loss/jsd": 0.0, "loss/logits": 0.3506108671426773, "step": 732 }, { "epoch": 0.045875, "grad_norm": 4.34375, "grad_norm_var": 0.11555989583333333, "learning_rate": 0.0001, "loss": 9.7691, "loss/crossentropy": 2.4628361463546753, "loss/hidden": 3.8515625, "loss/jsd": 0.0, "loss/logits": 0.3611077666282654, "step": 734 }, { "epoch": 0.046, "grad_norm": 4.53125, "grad_norm_var": 0.10852864583333334, "learning_rate": 0.0001, "loss": 9.7908, "loss/crossentropy": 2.450587034225464, "loss/hidden": 3.859375, "loss/jsd": 0.0, "loss/logits": 0.36645807325839996, "step": 736 }, { "epoch": 0.046125, "grad_norm": 4.75, "grad_norm_var": 0.10998942057291666, "learning_rate": 0.0001, "loss": 10.0238, "loss/crossentropy": 2.5827871561050415, "loss/hidden": 3.78125, "loss/jsd": 0.0, "loss/logits": 0.3806481957435608, "step": 738 }, { "epoch": 0.04625, "grad_norm": 3.953125, "grad_norm_var": 0.11862691243489583, "learning_rate": 0.0001, "loss": 9.453, "loss/crossentropy": 2.408301830291748, "loss/hidden": 3.78125, "loss/jsd": 0.0, "loss/logits": 0.3236120641231537, "step": 740 }, { "epoch": 0.046375, "grad_norm": 4.34375, "grad_norm_var": 0.0968658447265625, "learning_rate": 0.0001, "loss": 9.7187, "loss/crossentropy": 2.4682952165603638, "loss/hidden": 3.9140625, "loss/jsd": 0.0, "loss/logits": 0.40484650433063507, "step": 742 }, { "epoch": 0.0465, "grad_norm": 4.5, "grad_norm_var": 0.1087799072265625, "learning_rate": 0.0001, "loss": 9.8899, "loss/crossentropy": 2.7526875734329224, "loss/hidden": 3.8046875, "loss/jsd": 0.0, "loss/logits": 0.3805859088897705, "step": 744 }, { "epoch": 0.046625, "grad_norm": 4.59375, "grad_norm_var": 0.08279520670572917, "learning_rate": 0.0001, "loss": 9.8541, "loss/crossentropy": 2.268938183784485, "loss/hidden": 3.75, "loss/jsd": 0.0, "loss/logits": 0.3416333645582199, "step": 746 }, { "epoch": 0.04675, "grad_norm": 4.5, "grad_norm_var": 0.07593485514322916, "learning_rate": 0.0001, "loss": 9.4353, "loss/crossentropy": 2.4165321588516235, "loss/hidden": 3.7578125, "loss/jsd": 0.0, "loss/logits": 0.33534686267375946, "step": 748 }, { "epoch": 0.046875, "grad_norm": 4.21875, "grad_norm_var": 0.09172261555989583, "learning_rate": 0.0001, "loss": 9.4508, "loss/crossentropy": 2.4512590169906616, "loss/hidden": 3.7734375, "loss/jsd": 0.0, "loss/logits": 0.3545406609773636, "step": 750 }, { "epoch": 0.047, "grad_norm": 4.28125, "grad_norm_var": 0.09014383951822917, "learning_rate": 0.0001, "loss": 10.0917, "loss/crossentropy": 2.545518636703491, "loss/hidden": 3.96875, "loss/jsd": 0.0, "loss/logits": 0.49751946330070496, "step": 752 }, { "epoch": 0.047125, "grad_norm": 4.34375, "grad_norm_var": 0.0890533447265625, "learning_rate": 0.0001, "loss": 9.4346, "loss/crossentropy": 2.2614606618881226, "loss/hidden": 3.6953125, "loss/jsd": 0.0, "loss/logits": 0.34531380236148834, "step": 754 }, { "epoch": 0.04725, "grad_norm": 4.3125, "grad_norm_var": 0.07870686848958333, "learning_rate": 0.0001, "loss": 9.3389, "loss/crossentropy": 2.3133562803268433, "loss/hidden": 3.78125, "loss/jsd": 0.0, "loss/logits": 0.3800860345363617, "step": 756 }, { "epoch": 0.047375, "grad_norm": 4.75, "grad_norm_var": 0.09312744140625, "learning_rate": 0.0001, "loss": 9.9658, "loss/crossentropy": 2.5053844451904297, "loss/hidden": 3.796875, "loss/jsd": 0.0, "loss/logits": 0.348347008228302, "step": 758 }, { "epoch": 0.0475, "grad_norm": 4.0625, "grad_norm_var": 0.06365559895833334, "learning_rate": 0.0001, "loss": 9.5957, "loss/crossentropy": 2.1510268449783325, "loss/hidden": 3.78125, "loss/jsd": 0.0, "loss/logits": 0.3271169662475586, "step": 760 }, { "epoch": 0.047625, "grad_norm": 3.796875, "grad_norm_var": 0.0678131103515625, "learning_rate": 0.0001, "loss": 9.5272, "loss/crossentropy": 2.1917725801467896, "loss/hidden": 3.75, "loss/jsd": 0.0, "loss/logits": 0.32911764085292816, "step": 762 }, { "epoch": 0.04775, "grad_norm": 4.625, "grad_norm_var": 0.07316792805989583, "learning_rate": 0.0001, "loss": 9.6196, "loss/crossentropy": 2.262703061103821, "loss/hidden": 3.6875, "loss/jsd": 0.0, "loss/logits": 0.3407554179430008, "step": 764 }, { "epoch": 0.047875, "grad_norm": 4.21875, "grad_norm_var": 0.0635894775390625, "learning_rate": 0.0001, "loss": 9.5615, "loss/crossentropy": 2.2358585596084595, "loss/hidden": 3.6796875, "loss/jsd": 0.0, "loss/logits": 0.34859590232372284, "step": 766 }, { "epoch": 0.048, "grad_norm": 4.5625, "grad_norm_var": 0.0727691650390625, "learning_rate": 0.0001, "loss": 9.7362, "loss/crossentropy": 2.273680090904236, "loss/hidden": 3.8671875, "loss/jsd": 0.0, "loss/logits": 0.3434666693210602, "step": 768 }, { "epoch": 0.048125, "grad_norm": 4.78125, "grad_norm_var": 0.06664937337239583, "learning_rate": 0.0001, "loss": 9.4517, "loss/crossentropy": 2.2003235816955566, "loss/hidden": 3.7734375, "loss/jsd": 0.0, "loss/logits": 0.34724757075309753, "step": 770 }, { "epoch": 0.04825, "grad_norm": 4.0625, "grad_norm_var": 0.0718170166015625, "learning_rate": 0.0001, "loss": 10.0496, "loss/crossentropy": 2.356285572052002, "loss/hidden": 3.796875, "loss/jsd": 0.0, "loss/logits": 0.34804899990558624, "step": 772 }, { "epoch": 0.048375, "grad_norm": 3.75, "grad_norm_var": 0.0806793212890625, "learning_rate": 0.0001, "loss": 9.4018, "loss/crossentropy": 2.2518192529678345, "loss/hidden": 3.6875, "loss/jsd": 0.0, "loss/logits": 0.3232182711362839, "step": 774 }, { "epoch": 0.0485, "grad_norm": 4.75, "grad_norm_var": 0.0930572509765625, "learning_rate": 0.0001, "loss": 9.6718, "loss/crossentropy": 2.596095561981201, "loss/hidden": 3.765625, "loss/jsd": 0.0, "loss/logits": 0.3486844599246979, "step": 776 }, { "epoch": 0.048625, "grad_norm": 4.125, "grad_norm_var": 0.0822265625, "learning_rate": 0.0001, "loss": 9.6132, "loss/crossentropy": 2.3995965719223022, "loss/hidden": 3.71875, "loss/jsd": 0.0, "loss/logits": 0.3591943085193634, "step": 778 }, { "epoch": 0.04875, "grad_norm": 4.375, "grad_norm_var": 0.07615559895833333, "learning_rate": 0.0001, "loss": 9.664, "loss/crossentropy": 2.212980270385742, "loss/hidden": 3.7421875, "loss/jsd": 0.0, "loss/logits": 0.35707785189151764, "step": 780 }, { "epoch": 0.048875, "grad_norm": 4.21875, "grad_norm_var": 0.0861480712890625, "learning_rate": 0.0001, "loss": 9.7829, "loss/crossentropy": 2.3209805488586426, "loss/hidden": 3.8046875, "loss/jsd": 0.0, "loss/logits": 0.33315178751945496, "step": 782 }, { "epoch": 0.049, "grad_norm": 4.0625, "grad_norm_var": 0.0819976806640625, "learning_rate": 0.0001, "loss": 9.6294, "loss/crossentropy": 2.4062753915786743, "loss/hidden": 3.7734375, "loss/jsd": 0.0, "loss/logits": 0.3330836743116379, "step": 784 }, { "epoch": 0.049125, "grad_norm": 4.21875, "grad_norm_var": 0.06301167805989584, "learning_rate": 0.0001, "loss": 9.6967, "loss/crossentropy": 2.3731807470321655, "loss/hidden": 3.8671875, "loss/jsd": 0.0, "loss/logits": 0.37791262567043304, "step": 786 }, { "epoch": 0.04925, "grad_norm": 4.46875, "grad_norm_var": 0.07166239420572916, "learning_rate": 0.0001, "loss": 9.6106, "loss/crossentropy": 2.1310253143310547, "loss/hidden": 3.765625, "loss/jsd": 0.0, "loss/logits": 0.32675565779209137, "step": 788 }, { "epoch": 0.049375, "grad_norm": 3.828125, "grad_norm_var": 0.06562093098958334, "learning_rate": 0.0001, "loss": 9.5732, "loss/crossentropy": 2.2886255979537964, "loss/hidden": 3.7421875, "loss/jsd": 0.0, "loss/logits": 0.37167444825172424, "step": 790 }, { "epoch": 0.0495, "grad_norm": 4.34375, "grad_norm_var": 0.05289306640625, "learning_rate": 0.0001, "loss": 9.5136, "loss/crossentropy": 2.322494864463806, "loss/hidden": 3.7109375, "loss/jsd": 0.0, "loss/logits": 0.3283519744873047, "step": 792 }, { "epoch": 0.049625, "grad_norm": 3.96875, "grad_norm_var": 0.047196451822916666, "learning_rate": 0.0001, "loss": 9.7672, "loss/crossentropy": 2.7288074493408203, "loss/hidden": 3.8203125, "loss/jsd": 0.0, "loss/logits": 0.370631605386734, "step": 794 }, { "epoch": 0.04975, "grad_norm": 4.5, "grad_norm_var": 0.04845377604166667, "learning_rate": 0.0001, "loss": 9.666, "loss/crossentropy": 2.1383297443389893, "loss/hidden": 3.6328125, "loss/jsd": 0.0, "loss/logits": 0.32776103913784027, "step": 796 }, { "epoch": 0.049875, "grad_norm": 4.09375, "grad_norm_var": 0.0453033447265625, "learning_rate": 0.0001, "loss": 9.623, "loss/crossentropy": 2.4972459077835083, "loss/hidden": 3.7890625, "loss/jsd": 0.0, "loss/logits": 0.36165888607501984, "step": 798 }, { "epoch": 0.05, "grad_norm": 4.09375, "grad_norm_var": 0.05042317708333333, "learning_rate": 0.0001, "loss": 9.2915, "loss/crossentropy": 2.19729745388031, "loss/hidden": 3.734375, "loss/jsd": 0.0, "loss/logits": 0.318468302488327, "step": 800 }, { "epoch": 0.050125, "grad_norm": 4.125, "grad_norm_var": 0.059342447916666666, "learning_rate": 0.0001, "loss": 9.3799, "loss/crossentropy": 2.194010615348816, "loss/hidden": 3.6640625, "loss/jsd": 0.0, "loss/logits": 0.31498509645462036, "step": 802 }, { "epoch": 0.05025, "grad_norm": 3.96875, "grad_norm_var": 0.04267578125, "learning_rate": 0.0001, "loss": 9.5517, "loss/crossentropy": 2.269457697868347, "loss/hidden": 3.7109375, "loss/jsd": 0.0, "loss/logits": 0.3178607076406479, "step": 804 }, { "epoch": 0.050375, "grad_norm": 4.40625, "grad_norm_var": 0.0394927978515625, "learning_rate": 0.0001, "loss": 9.2452, "loss/crossentropy": 2.2216137647628784, "loss/hidden": 3.6953125, "loss/jsd": 0.0, "loss/logits": 0.3251790404319763, "step": 806 }, { "epoch": 0.0505, "grad_norm": 4.28125, "grad_norm_var": 0.04243876139322917, "learning_rate": 0.0001, "loss": 9.7497, "loss/crossentropy": 2.6958311796188354, "loss/hidden": 3.7890625, "loss/jsd": 0.0, "loss/logits": 0.3631015717983246, "step": 808 }, { "epoch": 0.050625, "grad_norm": 4.59375, "grad_norm_var": 0.0543121337890625, "learning_rate": 0.0001, "loss": 9.7743, "loss/crossentropy": 2.6329739093780518, "loss/hidden": 3.7421875, "loss/jsd": 0.0, "loss/logits": 0.3427456319332123, "step": 810 }, { "epoch": 0.05075, "grad_norm": 4.21875, "grad_norm_var": 0.0827789306640625, "learning_rate": 0.0001, "loss": 9.861, "loss/crossentropy": 2.41109561920166, "loss/hidden": 3.859375, "loss/jsd": 0.0, "loss/logits": 0.38177673518657684, "step": 812 }, { "epoch": 0.050875, "grad_norm": 4.125, "grad_norm_var": 0.09693603515625, "learning_rate": 0.0001, "loss": 9.3459, "loss/crossentropy": 2.413679838180542, "loss/hidden": 3.7109375, "loss/jsd": 0.0, "loss/logits": 0.3323095142841339, "step": 814 }, { "epoch": 0.051, "grad_norm": 3.96875, "grad_norm_var": 0.10078837076822916, "learning_rate": 0.0001, "loss": 9.2878, "loss/crossentropy": 2.4613407850265503, "loss/hidden": 3.7734375, "loss/jsd": 0.0, "loss/logits": 0.36210502684116364, "step": 816 }, { "epoch": 0.051125, "grad_norm": 4.28125, "grad_norm_var": 0.09621480305989584, "learning_rate": 0.0001, "loss": 9.5978, "loss/crossentropy": 2.3388036489486694, "loss/hidden": 3.8671875, "loss/jsd": 0.0, "loss/logits": 0.33505263924598694, "step": 818 }, { "epoch": 0.05125, "grad_norm": 3.859375, "grad_norm_var": 0.10100911458333334, "learning_rate": 0.0001, "loss": 9.5206, "loss/crossentropy": 2.504610538482666, "loss/hidden": 3.7890625, "loss/jsd": 0.0, "loss/logits": 0.35333533585071564, "step": 820 }, { "epoch": 0.051375, "grad_norm": 4.53125, "grad_norm_var": 0.10614827473958334, "learning_rate": 0.0001, "loss": 9.707, "loss/crossentropy": 2.3531359434127808, "loss/hidden": 3.84375, "loss/jsd": 0.0, "loss/logits": 0.3614940941333771, "step": 822 }, { "epoch": 0.0515, "grad_norm": 3.890625, "grad_norm_var": 0.11204325358072917, "learning_rate": 0.0001, "loss": 9.3598, "loss/crossentropy": 2.0972710251808167, "loss/hidden": 3.625, "loss/jsd": 0.0, "loss/logits": 0.31728605926036835, "step": 824 }, { "epoch": 0.051625, "grad_norm": 4.1875, "grad_norm_var": 0.09622395833333333, "learning_rate": 0.0001, "loss": 9.3816, "loss/crossentropy": 2.275819420814514, "loss/hidden": 3.7578125, "loss/jsd": 0.0, "loss/logits": 0.34381118416786194, "step": 826 }, { "epoch": 0.05175, "grad_norm": 4.125, "grad_norm_var": 0.04998372395833333, "learning_rate": 0.0001, "loss": 9.6247, "loss/crossentropy": 2.45046067237854, "loss/hidden": 3.7734375, "loss/jsd": 0.0, "loss/logits": 0.3419671058654785, "step": 828 }, { "epoch": 0.051875, "grad_norm": 4.53125, "grad_norm_var": 0.05650126139322917, "learning_rate": 0.0001, "loss": 9.8951, "loss/crossentropy": 2.7096316814422607, "loss/hidden": 3.7109375, "loss/jsd": 0.0, "loss/logits": 0.34755463898181915, "step": 830 }, { "epoch": 0.052, "grad_norm": 4.3125, "grad_norm_var": 0.05390523274739583, "learning_rate": 0.0001, "loss": 9.5586, "loss/crossentropy": 2.4081461429595947, "loss/hidden": 3.75, "loss/jsd": 0.0, "loss/logits": 0.35109463334083557, "step": 832 }, { "epoch": 0.052125, "grad_norm": 4.25, "grad_norm_var": 0.0544830322265625, "learning_rate": 0.0001, "loss": 9.338, "loss/crossentropy": 2.0195173621177673, "loss/hidden": 3.640625, "loss/jsd": 0.0, "loss/logits": 0.3229655623435974, "step": 834 }, { "epoch": 0.05225, "grad_norm": 4.3125, "grad_norm_var": 0.05168863932291667, "learning_rate": 0.0001, "loss": 9.4718, "loss/crossentropy": 2.262600541114807, "loss/hidden": 3.734375, "loss/jsd": 0.0, "loss/logits": 0.33351391553878784, "step": 836 }, { "epoch": 0.052375, "grad_norm": 3.9375, "grad_norm_var": 0.05157877604166667, "learning_rate": 0.0001, "loss": 9.5338, "loss/crossentropy": 2.387451410293579, "loss/hidden": 3.8515625, "loss/jsd": 0.0, "loss/logits": 0.3783055394887924, "step": 838 }, { "epoch": 0.0525, "grad_norm": 3.796875, "grad_norm_var": 0.04944661458333333, "learning_rate": 0.0001, "loss": 9.5232, "loss/crossentropy": 2.5872695446014404, "loss/hidden": 3.6640625, "loss/jsd": 0.0, "loss/logits": 0.34008149802684784, "step": 840 }, { "epoch": 0.052625, "grad_norm": 3.984375, "grad_norm_var": 0.048990885416666664, "learning_rate": 0.0001, "loss": 9.4675, "loss/crossentropy": 2.7655253410339355, "loss/hidden": 3.7734375, "loss/jsd": 0.0, "loss/logits": 0.340924471616745, "step": 842 }, { "epoch": 0.05275, "grad_norm": 3.859375, "grad_norm_var": 0.050633748372395836, "learning_rate": 0.0001, "loss": 9.2679, "loss/crossentropy": 2.4758448600769043, "loss/hidden": 3.6328125, "loss/jsd": 0.0, "loss/logits": 0.32951878011226654, "step": 844 }, { "epoch": 0.052875, "grad_norm": 4.375, "grad_norm_var": 0.06552632649739583, "learning_rate": 0.0001, "loss": 9.9271, "loss/crossentropy": 2.6429070234298706, "loss/hidden": 3.8203125, "loss/jsd": 0.0, "loss/logits": 0.3814462423324585, "step": 846 }, { "epoch": 0.053, "grad_norm": 4.625, "grad_norm_var": 0.08034566243489584, "learning_rate": 0.0001, "loss": 10.0684, "loss/crossentropy": 2.209423542022705, "loss/hidden": 3.90625, "loss/jsd": 0.0, "loss/logits": 0.3319186717271805, "step": 848 }, { "epoch": 0.053125, "grad_norm": 3.96875, "grad_norm_var": 0.0769683837890625, "learning_rate": 0.0001, "loss": 9.5685, "loss/crossentropy": 2.2579764127731323, "loss/hidden": 3.7265625, "loss/jsd": 0.0, "loss/logits": 0.3442992717027664, "step": 850 }, { "epoch": 0.05325, "grad_norm": 4.84375, "grad_norm_var": 0.1148834228515625, "learning_rate": 0.0001, "loss": 9.4361, "loss/crossentropy": 2.39439857006073, "loss/hidden": 3.734375, "loss/jsd": 0.0, "loss/logits": 0.3444969654083252, "step": 852 }, { "epoch": 0.053375, "grad_norm": 4.28125, "grad_norm_var": 0.10398661295572917, "learning_rate": 0.0001, "loss": 9.5651, "loss/crossentropy": 2.504552960395813, "loss/hidden": 3.640625, "loss/jsd": 0.0, "loss/logits": 0.34091413021087646, "step": 854 }, { "epoch": 0.0535, "grad_norm": 4.59375, "grad_norm_var": 0.09475911458333333, "learning_rate": 0.0001, "loss": 9.7258, "loss/crossentropy": 2.4847280979156494, "loss/hidden": 3.8828125, "loss/jsd": 0.0, "loss/logits": 0.3863519877195358, "step": 856 }, { "epoch": 0.053625, "grad_norm": 4.125, "grad_norm_var": 0.08271382649739584, "learning_rate": 0.0001, "loss": 9.5068, "loss/crossentropy": 2.5517282485961914, "loss/hidden": 3.71875, "loss/jsd": 0.0, "loss/logits": 0.33669134974479675, "step": 858 }, { "epoch": 0.05375, "grad_norm": 3.90625, "grad_norm_var": 0.07314046223958333, "learning_rate": 0.0001, "loss": 9.5156, "loss/crossentropy": 2.450470209121704, "loss/hidden": 3.6953125, "loss/jsd": 0.0, "loss/logits": 0.35644595324993134, "step": 860 }, { "epoch": 0.053875, "grad_norm": 4.40625, "grad_norm_var": 0.0788970947265625, "learning_rate": 0.0001, "loss": 9.7869, "loss/crossentropy": 2.687352776527405, "loss/hidden": 3.71875, "loss/jsd": 0.0, "loss/logits": 0.33138976991176605, "step": 862 }, { "epoch": 0.054, "grad_norm": 4.59375, "grad_norm_var": 0.1000152587890625, "learning_rate": 0.0001, "loss": 9.412, "loss/crossentropy": 2.5792051553726196, "loss/hidden": 3.8046875, "loss/jsd": 0.0, "loss/logits": 0.3428095132112503, "step": 864 }, { "epoch": 0.054125, "grad_norm": 4.09375, "grad_norm_var": 0.10471903483072917, "learning_rate": 0.0001, "loss": 9.4376, "loss/crossentropy": 2.189521312713623, "loss/hidden": 3.703125, "loss/jsd": 0.0, "loss/logits": 0.3646356761455536, "step": 866 }, { "epoch": 0.05425, "grad_norm": 6.4375, "grad_norm_var": 0.37280171712239585, "learning_rate": 0.0001, "loss": 9.9946, "loss/crossentropy": 2.4185925722122192, "loss/hidden": 3.671875, "loss/jsd": 0.0, "loss/logits": 0.359613835811615, "step": 868 }, { "epoch": 0.054375, "grad_norm": 5.09375, "grad_norm_var": 0.4150299072265625, "learning_rate": 0.0001, "loss": 9.6361, "loss/crossentropy": 2.419649362564087, "loss/hidden": 3.7734375, "loss/jsd": 0.0, "loss/logits": 0.33111467957496643, "step": 870 }, { "epoch": 0.0545, "grad_norm": 5.09375, "grad_norm_var": 0.4413970947265625, "learning_rate": 0.0001, "loss": 9.6636, "loss/crossentropy": 2.4185843467712402, "loss/hidden": 3.78125, "loss/jsd": 0.0, "loss/logits": 0.35009828209877014, "step": 872 }, { "epoch": 0.054625, "grad_norm": 4.3125, "grad_norm_var": 0.44112040201822916, "learning_rate": 0.0001, "loss": 9.6933, "loss/crossentropy": 2.2782691717147827, "loss/hidden": 3.71875, "loss/jsd": 0.0, "loss/logits": 0.388755202293396, "step": 874 }, { "epoch": 0.05475, "grad_norm": 3.734375, "grad_norm_var": 0.459912109375, "learning_rate": 0.0001, "loss": 9.7969, "loss/crossentropy": 2.194816470146179, "loss/hidden": 3.65625, "loss/jsd": 0.0, "loss/logits": 0.32018932700157166, "step": 876 }, { "epoch": 0.054875, "grad_norm": 4.5625, "grad_norm_var": 0.44416402180989584, "learning_rate": 0.0001, "loss": 9.6102, "loss/crossentropy": 2.50557017326355, "loss/hidden": 3.7265625, "loss/jsd": 0.0, "loss/logits": 0.3518366515636444, "step": 878 }, { "epoch": 0.055, "grad_norm": 4.78125, "grad_norm_var": 0.42001546223958336, "learning_rate": 0.0001, "loss": 9.7176, "loss/crossentropy": 2.4951841831207275, "loss/hidden": 3.765625, "loss/jsd": 0.0, "loss/logits": 0.3479674905538559, "step": 880 }, { "epoch": 0.055125, "grad_norm": 3.953125, "grad_norm_var": 0.4462636311848958, "learning_rate": 0.0001, "loss": 9.2982, "loss/crossentropy": 2.2794214487075806, "loss/hidden": 3.75, "loss/jsd": 0.0, "loss/logits": 0.33856503665447235, "step": 882 }, { "epoch": 0.05525, "grad_norm": 4.4375, "grad_norm_var": 0.20742899576822918, "learning_rate": 0.0001, "loss": 9.5027, "loss/crossentropy": 2.420092821121216, "loss/hidden": 3.84375, "loss/jsd": 0.0, "loss/logits": 0.32554225623607635, "step": 884 }, { "epoch": 0.055375, "grad_norm": 4.5625, "grad_norm_var": 0.1740386962890625, "learning_rate": 0.0001, "loss": 9.514, "loss/crossentropy": 2.4305249452590942, "loss/hidden": 3.71875, "loss/jsd": 0.0, "loss/logits": 0.3317463994026184, "step": 886 }, { "epoch": 0.0555, "grad_norm": 5.03125, "grad_norm_var": 0.1664215087890625, "learning_rate": 0.0001, "loss": 9.7395, "loss/crossentropy": 2.3821409940719604, "loss/hidden": 3.75, "loss/jsd": 0.0, "loss/logits": 0.36465059220790863, "step": 888 }, { "epoch": 0.055625, "grad_norm": 4.4375, "grad_norm_var": 0.14842020670572917, "learning_rate": 0.0001, "loss": 9.2789, "loss/crossentropy": 2.298262596130371, "loss/hidden": 3.6875, "loss/jsd": 0.0, "loss/logits": 0.31890998780727386, "step": 890 }, { "epoch": 0.05575, "grad_norm": 4.03125, "grad_norm_var": 0.12892252604166668, "learning_rate": 0.0001, "loss": 9.7294, "loss/crossentropy": 2.264374613761902, "loss/hidden": 3.7109375, "loss/jsd": 0.0, "loss/logits": 0.31738966703414917, "step": 892 }, { "epoch": 0.055875, "grad_norm": 4.0625, "grad_norm_var": 0.10719401041666667, "learning_rate": 0.0001, "loss": 9.669, "loss/crossentropy": 2.557625889778137, "loss/hidden": 3.6875, "loss/jsd": 0.0, "loss/logits": 0.37109945714473724, "step": 894 }, { "epoch": 0.056, "grad_norm": 3.8125, "grad_norm_var": 0.0967437744140625, "learning_rate": 0.0001, "loss": 9.3578, "loss/crossentropy": 2.513554573059082, "loss/hidden": 3.6484375, "loss/jsd": 0.0, "loss/logits": 0.3437999337911606, "step": 896 }, { "epoch": 0.056125, "grad_norm": 4.34375, "grad_norm_var": 0.10321858723958334, "learning_rate": 0.0001, "loss": 9.6894, "loss/crossentropy": 2.6883383989334106, "loss/hidden": 3.6796875, "loss/jsd": 0.0, "loss/logits": 0.34091816842556, "step": 898 }, { "epoch": 0.05625, "grad_norm": 4.21875, "grad_norm_var": 0.10778706868489583, "learning_rate": 0.0001, "loss": 9.5283, "loss/crossentropy": 2.5178507566452026, "loss/hidden": 3.59375, "loss/jsd": 0.0, "loss/logits": 0.31220842897892, "step": 900 }, { "epoch": 0.056375, "grad_norm": 4.09375, "grad_norm_var": 0.11022847493489583, "learning_rate": 0.0001, "loss": 9.4134, "loss/crossentropy": 2.286848306655884, "loss/hidden": 3.7265625, "loss/jsd": 0.0, "loss/logits": 0.36784152686595917, "step": 902 }, { "epoch": 0.0565, "grad_norm": 3.75, "grad_norm_var": 0.08430887858072916, "learning_rate": 0.0001, "loss": 9.5666, "loss/crossentropy": 2.330216407775879, "loss/hidden": 3.640625, "loss/jsd": 0.0, "loss/logits": 0.3340802788734436, "step": 904 }, { "epoch": 0.056625, "grad_norm": 4.15625, "grad_norm_var": 0.07683919270833334, "learning_rate": 0.0001, "loss": 9.4613, "loss/crossentropy": 2.5783122777938843, "loss/hidden": 3.765625, "loss/jsd": 0.0, "loss/logits": 0.3707699775695801, "step": 906 }, { "epoch": 0.05675, "grad_norm": 4.0, "grad_norm_var": 0.07731119791666667, "learning_rate": 0.0001, "loss": 9.6405, "loss/crossentropy": 2.39057457447052, "loss/hidden": 3.6875, "loss/jsd": 0.0, "loss/logits": 0.31178198754787445, "step": 908 }, { "epoch": 0.056875, "grad_norm": 4.0625, "grad_norm_var": 0.075927734375, "learning_rate": 0.0001, "loss": 9.3792, "loss/crossentropy": 2.2321670055389404, "loss/hidden": 3.7265625, "loss/jsd": 0.0, "loss/logits": 0.39315053820610046, "step": 910 }, { "epoch": 0.057, "grad_norm": 4.125, "grad_norm_var": 0.06603190104166666, "learning_rate": 0.0001, "loss": 9.4016, "loss/crossentropy": 2.457381010055542, "loss/hidden": 3.703125, "loss/jsd": 0.0, "loss/logits": 0.3697053790092468, "step": 912 }, { "epoch": 0.057125, "grad_norm": 4.21875, "grad_norm_var": 0.05308837890625, "learning_rate": 0.0001, "loss": 9.705, "loss/crossentropy": 2.3566343784332275, "loss/hidden": 3.671875, "loss/jsd": 0.0, "loss/logits": 0.31618085503578186, "step": 914 }, { "epoch": 0.05725, "grad_norm": 3.734375, "grad_norm_var": 0.05917561848958333, "learning_rate": 0.0001, "loss": 9.2448, "loss/crossentropy": 2.349318027496338, "loss/hidden": 3.59375, "loss/jsd": 0.0, "loss/logits": 0.3112690597772598, "step": 916 }, { "epoch": 0.057375, "grad_norm": 5.0625, "grad_norm_var": 0.09866129557291667, "learning_rate": 0.0001, "loss": 9.7381, "loss/crossentropy": 2.605436682701111, "loss/hidden": 3.8046875, "loss/jsd": 0.0, "loss/logits": 0.38693949580192566, "step": 918 }, { "epoch": 0.0575, "grad_norm": 4.25, "grad_norm_var": 0.08850504557291666, "learning_rate": 0.0001, "loss": 9.36, "loss/crossentropy": 2.3533878326416016, "loss/hidden": 3.71875, "loss/jsd": 0.0, "loss/logits": 0.32768990099430084, "step": 920 }, { "epoch": 0.057625, "grad_norm": 3.71875, "grad_norm_var": 0.13023173014322917, "learning_rate": 0.0001, "loss": 9.4501, "loss/crossentropy": 2.4686715602874756, "loss/hidden": 3.703125, "loss/jsd": 0.0, "loss/logits": 0.314393013715744, "step": 922 }, { "epoch": 0.05775, "grad_norm": 3.640625, "grad_norm_var": 0.16578776041666668, "learning_rate": 0.0001, "loss": 9.0475, "loss/crossentropy": 2.204137921333313, "loss/hidden": 3.6328125, "loss/jsd": 0.0, "loss/logits": 0.30986711382865906, "step": 924 }, { "epoch": 0.057875, "grad_norm": 3.65625, "grad_norm_var": 0.1955230712890625, "learning_rate": 0.0001, "loss": 9.2625, "loss/crossentropy": 2.505138397216797, "loss/hidden": 3.6640625, "loss/jsd": 0.0, "loss/logits": 0.3187219649553299, "step": 926 }, { "epoch": 0.058, "grad_norm": 4.09375, "grad_norm_var": 0.19621988932291667, "learning_rate": 0.0001, "loss": 9.2882, "loss/crossentropy": 2.4183324575424194, "loss/hidden": 3.6640625, "loss/jsd": 0.0, "loss/logits": 0.3302215486764908, "step": 928 }, { "epoch": 0.058125, "grad_norm": 4.03125, "grad_norm_var": 0.179736328125, "learning_rate": 0.0001, "loss": 9.5348, "loss/crossentropy": 2.4528021812438965, "loss/hidden": 3.7421875, "loss/jsd": 0.0, "loss/logits": 0.34695957601070404, "step": 930 }, { "epoch": 0.05825, "grad_norm": 4.1875, "grad_norm_var": 0.18042704264322917, "learning_rate": 0.0001, "loss": 9.3228, "loss/crossentropy": 2.2103404998779297, "loss/hidden": 3.6484375, "loss/jsd": 0.0, "loss/logits": 0.3665204644203186, "step": 932 }, { "epoch": 0.058375, "grad_norm": 4.15625, "grad_norm_var": 0.10943094889322917, "learning_rate": 0.0001, "loss": 9.3404, "loss/crossentropy": 2.180467367172241, "loss/hidden": 3.59375, "loss/jsd": 0.0, "loss/logits": 0.299451008439064, "step": 934 }, { "epoch": 0.0585, "grad_norm": 3.703125, "grad_norm_var": 0.11096598307291666, "learning_rate": 0.0001, "loss": 9.3411, "loss/crossentropy": 2.7028924226760864, "loss/hidden": 3.671875, "loss/jsd": 0.0, "loss/logits": 0.35058994591236115, "step": 936 }, { "epoch": 0.058625, "grad_norm": 4.0, "grad_norm_var": 0.04487202962239583, "learning_rate": 0.0001, "loss": 9.3285, "loss/crossentropy": 2.4909303188323975, "loss/hidden": 3.6875, "loss/jsd": 0.0, "loss/logits": 0.33308878540992737, "step": 938 }, { "epoch": 0.05875, "grad_norm": 3.90625, "grad_norm_var": 0.03717041015625, "learning_rate": 0.0001, "loss": 9.4385, "loss/crossentropy": 2.3014419078826904, "loss/hidden": 3.6015625, "loss/jsd": 0.0, "loss/logits": 0.3322508633136749, "step": 940 }, { "epoch": 0.058875, "grad_norm": 4.09375, "grad_norm_var": 0.0290435791015625, "learning_rate": 0.0001, "loss": 9.5862, "loss/crossentropy": 2.5005375146865845, "loss/hidden": 3.765625, "loss/jsd": 0.0, "loss/logits": 0.34335020184516907, "step": 942 }, { "epoch": 0.059, "grad_norm": 5.0, "grad_norm_var": 0.10372721354166667, "learning_rate": 0.0001, "loss": 9.4993, "loss/crossentropy": 2.428452491760254, "loss/hidden": 3.7109375, "loss/jsd": 0.0, "loss/logits": 0.3558335155248642, "step": 944 }, { "epoch": 0.059125, "grad_norm": 4.46875, "grad_norm_var": 0.11492513020833334, "learning_rate": 0.0001, "loss": 9.3561, "loss/crossentropy": 2.450140953063965, "loss/hidden": 3.75, "loss/jsd": 0.0, "loss/logits": 0.32736936211586, "step": 946 }, { "epoch": 0.05925, "grad_norm": 3.609375, "grad_norm_var": 0.12048238118489583, "learning_rate": 0.0001, "loss": 9.4383, "loss/crossentropy": 2.4876564741134644, "loss/hidden": 3.7109375, "loss/jsd": 0.0, "loss/logits": 0.31698183715343475, "step": 948 }, { "epoch": 0.059375, "grad_norm": 4.21875, "grad_norm_var": 0.12451883951822916, "learning_rate": 0.0001, "loss": 9.6831, "loss/crossentropy": 2.384592890739441, "loss/hidden": 3.7421875, "loss/jsd": 0.0, "loss/logits": 0.32197698950767517, "step": 950 }, { "epoch": 0.0595, "grad_norm": 3.59375, "grad_norm_var": 0.12683919270833333, "learning_rate": 0.0001, "loss": 9.2944, "loss/crossentropy": 2.35392427444458, "loss/hidden": 3.65625, "loss/jsd": 0.0, "loss/logits": 0.36352527141571045, "step": 952 }, { "epoch": 0.059625, "grad_norm": 4.0625, "grad_norm_var": 0.11980794270833334, "learning_rate": 0.0001, "loss": 9.0958, "loss/crossentropy": 2.4466415643692017, "loss/hidden": 3.6328125, "loss/jsd": 0.0, "loss/logits": 0.30798208713531494, "step": 954 }, { "epoch": 0.05975, "grad_norm": 4.375, "grad_norm_var": 0.11995442708333333, "learning_rate": 0.0001, "loss": 9.4957, "loss/crossentropy": 2.5927644968032837, "loss/hidden": 3.5546875, "loss/jsd": 0.0, "loss/logits": 0.33347761631011963, "step": 956 }, { "epoch": 0.059875, "grad_norm": 4.3125, "grad_norm_var": 0.12353413899739583, "learning_rate": 0.0001, "loss": 9.3342, "loss/crossentropy": 2.4658687114715576, "loss/hidden": 3.75, "loss/jsd": 0.0, "loss/logits": 0.3318018615245819, "step": 958 }, { "epoch": 0.06, "grad_norm": 4.28125, "grad_norm_var": 0.06822001139322917, "learning_rate": 0.0001, "loss": 9.4473, "loss/crossentropy": 2.4019787311553955, "loss/hidden": 3.7734375, "loss/jsd": 0.0, "loss/logits": 0.3761335462331772, "step": 960 }, { "epoch": 0.060125, "grad_norm": 4.46875, "grad_norm_var": 0.06902567545572917, "learning_rate": 0.0001, "loss": 9.4095, "loss/crossentropy": 2.6830371618270874, "loss/hidden": 3.703125, "loss/jsd": 0.0, "loss/logits": 0.32209448516368866, "step": 962 }, { "epoch": 0.06025, "grad_norm": 3.875, "grad_norm_var": 0.059716796875, "learning_rate": 0.0001, "loss": 9.3998, "loss/crossentropy": 2.283499240875244, "loss/hidden": 3.640625, "loss/jsd": 0.0, "loss/logits": 0.3431689292192459, "step": 964 }, { "epoch": 0.060375, "grad_norm": 3.8125, "grad_norm_var": 0.0599761962890625, "learning_rate": 0.0001, "loss": 9.12, "loss/crossentropy": 2.146597146987915, "loss/hidden": 3.640625, "loss/jsd": 0.0, "loss/logits": 0.32681937515735626, "step": 966 }, { "epoch": 0.0605, "grad_norm": 3.796875, "grad_norm_var": 0.048127237955729166, "learning_rate": 0.0001, "loss": 9.2878, "loss/crossentropy": 2.3824340105056763, "loss/hidden": 3.65625, "loss/jsd": 0.0, "loss/logits": 0.32292675971984863, "step": 968 }, { "epoch": 0.060625, "grad_norm": 3.875, "grad_norm_var": 0.060465494791666664, "learning_rate": 0.0001, "loss": 9.1892, "loss/crossentropy": 2.3321211338043213, "loss/hidden": 3.703125, "loss/jsd": 0.0, "loss/logits": 0.3173847645521164, "step": 970 }, { "epoch": 0.06075, "grad_norm": 4.09375, "grad_norm_var": 0.05028889973958333, "learning_rate": 0.0001, "loss": 9.5526, "loss/crossentropy": 2.5641666650772095, "loss/hidden": 3.6484375, "loss/jsd": 0.0, "loss/logits": 0.35992586612701416, "step": 972 }, { "epoch": 0.060875, "grad_norm": 3.984375, "grad_norm_var": 0.04504801432291667, "learning_rate": 0.0001, "loss": 9.3705, "loss/crossentropy": 2.34674608707428, "loss/hidden": 3.6796875, "loss/jsd": 0.0, "loss/logits": 0.32495957612991333, "step": 974 }, { "epoch": 0.061, "grad_norm": 4.03125, "grad_norm_var": 0.065625, "learning_rate": 0.0001, "loss": 9.3391, "loss/crossentropy": 2.447916865348816, "loss/hidden": 3.6796875, "loss/jsd": 0.0, "loss/logits": 0.3161363750696182, "step": 976 }, { "epoch": 0.061125, "grad_norm": 3.796875, "grad_norm_var": 0.0483062744140625, "learning_rate": 0.0001, "loss": 9.3675, "loss/crossentropy": 2.361741304397583, "loss/hidden": 3.578125, "loss/jsd": 0.0, "loss/logits": 0.30326806008815765, "step": 978 }, { "epoch": 0.06125, "grad_norm": 3.90625, "grad_norm_var": 0.05650126139322917, "learning_rate": 0.0001, "loss": 9.3554, "loss/crossentropy": 2.4683319330215454, "loss/hidden": 3.671875, "loss/jsd": 0.0, "loss/logits": 0.34657470881938934, "step": 980 }, { "epoch": 0.061375, "grad_norm": 3.828125, "grad_norm_var": 0.05673421223958333, "learning_rate": 0.0001, "loss": 9.229, "loss/crossentropy": 2.26959490776062, "loss/hidden": 3.625, "loss/jsd": 0.0, "loss/logits": 0.3553328216075897, "step": 982 }, { "epoch": 0.0615, "grad_norm": 4.03125, "grad_norm_var": 0.0560455322265625, "learning_rate": 0.0001, "loss": 9.2883, "loss/crossentropy": 2.5307698249816895, "loss/hidden": 3.703125, "loss/jsd": 0.0, "loss/logits": 0.31306394934654236, "step": 984 }, { "epoch": 0.061625, "grad_norm": 5.0625, "grad_norm_var": 0.16921284993489583, "learning_rate": 0.0001, "loss": 9.4596, "loss/crossentropy": 2.325950264930725, "loss/hidden": 3.6328125, "loss/jsd": 0.0, "loss/logits": 0.3328956216573715, "step": 986 }, { "epoch": 0.06175, "grad_norm": 3.6875, "grad_norm_var": 0.18103739420572917, "learning_rate": 0.0001, "loss": 9.2764, "loss/crossentropy": 2.3399884700775146, "loss/hidden": 3.6640625, "loss/jsd": 0.0, "loss/logits": 0.3336493968963623, "step": 988 }, { "epoch": 0.061875, "grad_norm": 3.84375, "grad_norm_var": 0.18277079264322918, "learning_rate": 0.0001, "loss": 9.2997, "loss/crossentropy": 2.4344476461410522, "loss/hidden": 3.6640625, "loss/jsd": 0.0, "loss/logits": 0.31361398100852966, "step": 990 }, { "epoch": 0.062, "grad_norm": 3.984375, "grad_norm_var": 0.17678629557291667, "learning_rate": 0.0001, "loss": 9.4531, "loss/crossentropy": 2.532125949859619, "loss/hidden": 3.5859375, "loss/jsd": 0.0, "loss/logits": 0.31050997972488403, "step": 992 }, { "epoch": 0.062125, "grad_norm": 5.625, "grad_norm_var": 0.3181711832682292, "learning_rate": 0.0001, "loss": 9.3085, "loss/crossentropy": 2.205925226211548, "loss/hidden": 3.65625, "loss/jsd": 0.0, "loss/logits": 0.3132568746805191, "step": 994 }, { "epoch": 0.06225, "grad_norm": 4.5, "grad_norm_var": 0.29087626139322914, "learning_rate": 0.0001, "loss": 9.4214, "loss/crossentropy": 2.3645405769348145, "loss/hidden": 3.6484375, "loss/jsd": 0.0, "loss/logits": 0.33257976174354553, "step": 996 }, { "epoch": 0.062375, "grad_norm": 4.125, "grad_norm_var": 0.2752349853515625, "learning_rate": 0.0001, "loss": 9.248, "loss/crossentropy": 2.6228253841400146, "loss/hidden": 3.6484375, "loss/jsd": 0.0, "loss/logits": 0.3128903806209564, "step": 998 }, { "epoch": 0.0625, "grad_norm": 3.828125, "grad_norm_var": 0.2778472900390625, "learning_rate": 0.0001, "loss": 9.3876, "loss/crossentropy": 2.4406707286834717, "loss/hidden": 3.703125, "loss/jsd": 0.0, "loss/logits": 0.3451492637395859, "step": 1000 }, { "epoch": 0.062625, "grad_norm": 3.625, "grad_norm_var": 0.22431233723958333, "learning_rate": 0.0001, "loss": 9.3272, "loss/crossentropy": 2.1737005710601807, "loss/hidden": 3.609375, "loss/jsd": 0.0, "loss/logits": 0.2996635288000107, "step": 1002 }, { "epoch": 0.06275, "grad_norm": 3.671875, "grad_norm_var": 0.22522684733072917, "learning_rate": 0.0001, "loss": 9.1276, "loss/crossentropy": 2.3624621629714966, "loss/hidden": 3.625, "loss/jsd": 0.0, "loss/logits": 0.32320792973041534, "step": 1004 }, { "epoch": 0.062875, "grad_norm": 3.84375, "grad_norm_var": 0.22323811848958333, "learning_rate": 0.0001, "loss": 9.1599, "loss/crossentropy": 2.163053512573242, "loss/hidden": 3.53125, "loss/jsd": 0.0, "loss/logits": 0.3028685748577118, "step": 1006 }, { "epoch": 0.063, "grad_norm": 3.75, "grad_norm_var": 0.21956278483072916, "learning_rate": 0.0001, "loss": 9.0966, "loss/crossentropy": 2.3403860330581665, "loss/hidden": 3.6171875, "loss/jsd": 0.0, "loss/logits": 0.31008927524089813, "step": 1008 }, { "epoch": 0.063125, "grad_norm": 4.25, "grad_norm_var": 0.052098592122395836, "learning_rate": 0.0001, "loss": 9.3055, "loss/crossentropy": 2.5604758262634277, "loss/hidden": 3.6484375, "loss/jsd": 0.0, "loss/logits": 0.3352798819541931, "step": 1010 }, { "epoch": 0.06325, "grad_norm": 4.0, "grad_norm_var": 0.0339263916015625, "learning_rate": 0.0001, "loss": 9.2584, "loss/crossentropy": 2.5583336353302, "loss/hidden": 3.6796875, "loss/jsd": 0.0, "loss/logits": 0.323485866189003, "step": 1012 }, { "epoch": 0.063375, "grad_norm": 3.53125, "grad_norm_var": 0.042626953125, "learning_rate": 0.0001, "loss": 9.2338, "loss/crossentropy": 2.4750031232833862, "loss/hidden": 3.6796875, "loss/jsd": 0.0, "loss/logits": 0.32088278234004974, "step": 1014 }, { "epoch": 0.0635, "grad_norm": 4.28125, "grad_norm_var": 0.048005167643229166, "learning_rate": 0.0001, "loss": 9.4657, "loss/crossentropy": 2.3103621006011963, "loss/hidden": 3.703125, "loss/jsd": 0.0, "loss/logits": 0.3347797989845276, "step": 1016 }, { "epoch": 0.063625, "grad_norm": 3.890625, "grad_norm_var": 0.04117431640625, "learning_rate": 0.0001, "loss": 9.2284, "loss/crossentropy": 2.281406044960022, "loss/hidden": 3.703125, "loss/jsd": 0.0, "loss/logits": 0.33119070529937744, "step": 1018 }, { "epoch": 0.06375, "grad_norm": 3.671875, "grad_norm_var": 0.043822224934895834, "learning_rate": 0.0001, "loss": 9.1061, "loss/crossentropy": 2.3903090953826904, "loss/hidden": 3.640625, "loss/jsd": 0.0, "loss/logits": 0.30517514050006866, "step": 1020 }, { "epoch": 0.063875, "grad_norm": 3.828125, "grad_norm_var": 0.04397786458333333, "learning_rate": 0.0001, "loss": 9.3342, "loss/crossentropy": 2.4621089696884155, "loss/hidden": 3.671875, "loss/jsd": 0.0, "loss/logits": 0.3505241721868515, "step": 1022 }, { "epoch": 0.064, "grad_norm": 3.734375, "grad_norm_var": 0.051813761393229164, "learning_rate": 0.0001, "loss": 9.1248, "loss/crossentropy": 2.3870365619659424, "loss/hidden": 3.6171875, "loss/jsd": 0.0, "loss/logits": 0.3047706037759781, "step": 1024 }, { "epoch": 0.064125, "grad_norm": 4.09375, "grad_norm_var": 0.053564453125, "learning_rate": 0.0001, "loss": 9.1846, "loss/crossentropy": 2.7421722412109375, "loss/hidden": 3.7265625, "loss/jsd": 0.0, "loss/logits": 0.3265855461359024, "step": 1026 }, { "epoch": 0.06425, "grad_norm": 3.703125, "grad_norm_var": 0.0537261962890625, "learning_rate": 0.0001, "loss": 9.2907, "loss/crossentropy": 2.296812057495117, "loss/hidden": 3.6328125, "loss/jsd": 0.0, "loss/logits": 0.2955050766468048, "step": 1028 }, { "epoch": 0.064375, "grad_norm": 4.09375, "grad_norm_var": 0.0503570556640625, "learning_rate": 0.0001, "loss": 9.3552, "loss/crossentropy": 2.6835397481918335, "loss/hidden": 3.6484375, "loss/jsd": 0.0, "loss/logits": 0.33152663707733154, "step": 1030 }, { "epoch": 0.0645, "grad_norm": 4.15625, "grad_norm_var": 0.0430084228515625, "learning_rate": 0.0001, "loss": 9.3363, "loss/crossentropy": 2.644715666770935, "loss/hidden": 3.640625, "loss/jsd": 0.0, "loss/logits": 0.3512026369571686, "step": 1032 }, { "epoch": 0.064625, "grad_norm": 4.15625, "grad_norm_var": 0.0596343994140625, "learning_rate": 0.0001, "loss": 9.4068, "loss/crossentropy": 2.424636483192444, "loss/hidden": 3.640625, "loss/jsd": 0.0, "loss/logits": 0.3079882860183716, "step": 1034 }, { "epoch": 0.06475, "grad_norm": 3.96875, "grad_norm_var": 0.05446675618489583, "learning_rate": 0.0001, "loss": 9.4486, "loss/crossentropy": 2.347719192504883, "loss/hidden": 3.6796875, "loss/jsd": 0.0, "loss/logits": 0.3452465981245041, "step": 1036 }, { "epoch": 0.064875, "grad_norm": 3.765625, "grad_norm_var": 0.07506103515625, "learning_rate": 0.0001, "loss": 8.9964, "loss/crossentropy": 2.0868254899978638, "loss/hidden": 3.5234375, "loss/jsd": 0.0, "loss/logits": 0.3242860585451126, "step": 1038 }, { "epoch": 0.065, "grad_norm": 5.40625, "grad_norm_var": 0.21399637858072917, "learning_rate": 0.0001, "loss": 9.4657, "loss/crossentropy": 2.4020437002182007, "loss/hidden": 3.5859375, "loss/jsd": 0.0, "loss/logits": 0.3079642355442047, "step": 1040 }, { "epoch": 0.065125, "grad_norm": 3.921875, "grad_norm_var": 0.19934488932291666, "learning_rate": 0.0001, "loss": 9.4337, "loss/crossentropy": 2.4271044731140137, "loss/hidden": 3.703125, "loss/jsd": 0.0, "loss/logits": 0.3291940689086914, "step": 1042 }, { "epoch": 0.06525, "grad_norm": 3.765625, "grad_norm_var": 0.19329020182291667, "learning_rate": 0.0001, "loss": 9.196, "loss/crossentropy": 2.3336949348449707, "loss/hidden": 3.6015625, "loss/jsd": 0.0, "loss/logits": 0.3126966655254364, "step": 1044 }, { "epoch": 0.065375, "grad_norm": 3.6875, "grad_norm_var": 0.19695638020833334, "learning_rate": 0.0001, "loss": 9.4016, "loss/crossentropy": 2.5471415519714355, "loss/hidden": 3.6328125, "loss/jsd": 0.0, "loss/logits": 0.33399492502212524, "step": 1046 }, { "epoch": 0.0655, "grad_norm": 3.984375, "grad_norm_var": 0.19744364420572916, "learning_rate": 0.0001, "loss": 9.0525, "loss/crossentropy": 2.028559982776642, "loss/hidden": 3.59375, "loss/jsd": 0.0, "loss/logits": 0.3023695796728134, "step": 1048 }, { "epoch": 0.065625, "grad_norm": 3.9375, "grad_norm_var": 0.1896392822265625, "learning_rate": 0.0001, "loss": 9.2038, "loss/crossentropy": 2.2506083250045776, "loss/hidden": 3.6171875, "loss/jsd": 0.0, "loss/logits": 0.3274885416030884, "step": 1050 }, { "epoch": 0.06575, "grad_norm": 4.0, "grad_norm_var": 0.18593343098958334, "learning_rate": 0.0001, "loss": 9.3255, "loss/crossentropy": 2.6331071853637695, "loss/hidden": 3.5859375, "loss/jsd": 0.0, "loss/logits": 0.30817919969558716, "step": 1052 }, { "epoch": 0.065875, "grad_norm": 3.875, "grad_norm_var": 0.15797526041666668, "learning_rate": 0.0001, "loss": 9.2451, "loss/crossentropy": 2.354863405227661, "loss/hidden": 3.640625, "loss/jsd": 0.0, "loss/logits": 0.33920133113861084, "step": 1054 }, { "epoch": 0.066, "grad_norm": 3.90625, "grad_norm_var": 0.015315755208333334, "learning_rate": 0.0001, "loss": 9.3564, "loss/crossentropy": 2.6202335357666016, "loss/hidden": 3.546875, "loss/jsd": 0.0, "loss/logits": 0.3073268234729767, "step": 1056 }, { "epoch": 0.066125, "grad_norm": 3.796875, "grad_norm_var": 0.0164703369140625, "learning_rate": 0.0001, "loss": 9.1658, "loss/crossentropy": 2.302557349205017, "loss/hidden": 3.5546875, "loss/jsd": 0.0, "loss/logits": 0.31188826262950897, "step": 1058 }, { "epoch": 0.06625, "grad_norm": 3.453125, "grad_norm_var": 0.027293904622395834, "learning_rate": 0.0001, "loss": 9.131, "loss/crossentropy": 2.514571189880371, "loss/hidden": 3.6015625, "loss/jsd": 0.0, "loss/logits": 0.3184218853712082, "step": 1060 }, { "epoch": 0.066375, "grad_norm": 3.859375, "grad_norm_var": 0.025162760416666666, "learning_rate": 0.0001, "loss": 9.2056, "loss/crossentropy": 2.264451503753662, "loss/hidden": 3.6328125, "loss/jsd": 0.0, "loss/logits": 0.34064817428588867, "step": 1062 }, { "epoch": 0.0665, "grad_norm": 3.9375, "grad_norm_var": 0.018180338541666667, "learning_rate": 0.0001, "loss": 9.0521, "loss/crossentropy": 2.342800498008728, "loss/hidden": 3.625, "loss/jsd": 0.0, "loss/logits": 0.33910292387008667, "step": 1064 }, { "epoch": 0.066625, "grad_norm": 3.90625, "grad_norm_var": 0.021320597330729166, "learning_rate": 0.0001, "loss": 9.3295, "loss/crossentropy": 2.5191909074783325, "loss/hidden": 3.6484375, "loss/jsd": 0.0, "loss/logits": 0.34528471529483795, "step": 1066 }, { "epoch": 0.06675, "grad_norm": 3.90625, "grad_norm_var": 0.020930989583333334, "learning_rate": 0.0001, "loss": 9.2792, "loss/crossentropy": 2.6589291095733643, "loss/hidden": 3.6640625, "loss/jsd": 0.0, "loss/logits": 0.3547402173280716, "step": 1068 }, { "epoch": 0.066875, "grad_norm": 3.59375, "grad_norm_var": 0.023167928059895832, "learning_rate": 0.0001, "loss": 9.0386, "loss/crossentropy": 2.1663339138031006, "loss/hidden": 3.6171875, "loss/jsd": 0.0, "loss/logits": 0.3398684561252594, "step": 1070 }, { "epoch": 0.067, "grad_norm": 4.46875, "grad_norm_var": 0.06620992024739583, "learning_rate": 0.0001, "loss": 9.538, "loss/crossentropy": 2.518619418144226, "loss/hidden": 3.6171875, "loss/jsd": 0.0, "loss/logits": 0.33177968859672546, "step": 1072 }, { "epoch": 0.067125, "grad_norm": 3.9375, "grad_norm_var": 0.06520894368489584, "learning_rate": 0.0001, "loss": 9.4663, "loss/crossentropy": 2.564071536064148, "loss/hidden": 3.65625, "loss/jsd": 0.0, "loss/logits": 0.3523600995540619, "step": 1074 }, { "epoch": 0.06725, "grad_norm": 3.65625, "grad_norm_var": 0.056868489583333334, "learning_rate": 0.0001, "loss": 9.1435, "loss/crossentropy": 2.314103364944458, "loss/hidden": 3.578125, "loss/jsd": 0.0, "loss/logits": 0.3031466454267502, "step": 1076 }, { "epoch": 0.067375, "grad_norm": 3.96875, "grad_norm_var": 0.0743072509765625, "learning_rate": 0.0001, "loss": 9.0273, "loss/crossentropy": 2.397694706916809, "loss/hidden": 3.4609375, "loss/jsd": 0.0, "loss/logits": 0.293083518743515, "step": 1078 }, { "epoch": 0.0675, "grad_norm": 4.65625, "grad_norm_var": 0.11435139973958333, "learning_rate": 0.0001, "loss": 9.6091, "loss/crossentropy": 2.3738266229629517, "loss/hidden": 3.5703125, "loss/jsd": 0.0, "loss/logits": 0.35242393612861633, "step": 1080 }, { "epoch": 0.067625, "grad_norm": 3.953125, "grad_norm_var": 0.1121490478515625, "learning_rate": 0.0001, "loss": 9.3609, "loss/crossentropy": 2.5533446073532104, "loss/hidden": 3.6015625, "loss/jsd": 0.0, "loss/logits": 0.3422684669494629, "step": 1082 }, { "epoch": 0.06775, "grad_norm": 3.78125, "grad_norm_var": 0.10871988932291667, "learning_rate": 0.0001, "loss": 9.3206, "loss/crossentropy": 2.405779242515564, "loss/hidden": 3.6171875, "loss/jsd": 0.0, "loss/logits": 0.310599148273468, "step": 1084 }, { "epoch": 0.067875, "grad_norm": 3.953125, "grad_norm_var": 0.10204976399739583, "learning_rate": 0.0001, "loss": 9.0713, "loss/crossentropy": 2.2090498208999634, "loss/hidden": 3.5546875, "loss/jsd": 0.0, "loss/logits": 0.3123241662979126, "step": 1086 }, { "epoch": 0.068, "grad_norm": 3.75, "grad_norm_var": 0.0724761962890625, "learning_rate": 0.0001, "loss": 9.0476, "loss/crossentropy": 2.280885696411133, "loss/hidden": 3.6171875, "loss/jsd": 0.0, "loss/logits": 0.2895790636539459, "step": 1088 }, { "epoch": 0.068125, "grad_norm": 3.78125, "grad_norm_var": 0.0732330322265625, "learning_rate": 0.0001, "loss": 9.2207, "loss/crossentropy": 2.337521195411682, "loss/hidden": 3.5703125, "loss/jsd": 0.0, "loss/logits": 0.3371659815311432, "step": 1090 }, { "epoch": 0.06825, "grad_norm": 3.578125, "grad_norm_var": 0.08389383951822917, "learning_rate": 0.0001, "loss": 8.9717, "loss/crossentropy": 2.358444333076477, "loss/hidden": 3.640625, "loss/jsd": 0.0, "loss/logits": 0.34108731150627136, "step": 1092 }, { "epoch": 0.068375, "grad_norm": 4.09375, "grad_norm_var": 0.07141520182291666, "learning_rate": 0.0001, "loss": 9.0686, "loss/crossentropy": 2.2623904943466187, "loss/hidden": 3.546875, "loss/jsd": 0.0, "loss/logits": 0.3080063462257385, "step": 1094 }, { "epoch": 0.0685, "grad_norm": 4.03125, "grad_norm_var": 0.029084269205729166, "learning_rate": 0.0001, "loss": 9.4291, "loss/crossentropy": 2.2225699424743652, "loss/hidden": 3.5546875, "loss/jsd": 0.0, "loss/logits": 0.28325292468070984, "step": 1096 }, { "epoch": 0.068625, "grad_norm": 3.6875, "grad_norm_var": 0.02789306640625, "learning_rate": 0.0001, "loss": 9.258, "loss/crossentropy": 2.362979292869568, "loss/hidden": 3.5859375, "loss/jsd": 0.0, "loss/logits": 0.3146945536136627, "step": 1098 }, { "epoch": 0.06875, "grad_norm": 3.9375, "grad_norm_var": 0.035008748372395836, "learning_rate": 0.0001, "loss": 9.0957, "loss/crossentropy": 2.3709558248519897, "loss/hidden": 3.625, "loss/jsd": 0.0, "loss/logits": 0.2997266799211502, "step": 1100 }, { "epoch": 0.068875, "grad_norm": 3.421875, "grad_norm_var": 0.049637858072916666, "learning_rate": 0.0001, "loss": 8.8482, "loss/crossentropy": 2.1995412707328796, "loss/hidden": 3.4921875, "loss/jsd": 0.0, "loss/logits": 0.31232061982154846, "step": 1102 }, { "epoch": 0.069, "grad_norm": 4.96875, "grad_norm_var": 0.13362630208333334, "learning_rate": 0.0001, "loss": 9.3721, "loss/crossentropy": 2.179778814315796, "loss/hidden": 3.5703125, "loss/jsd": 0.0, "loss/logits": 0.29335537552833557, "step": 1104 }, { "epoch": 0.069125, "grad_norm": 4.3125, "grad_norm_var": 0.35261128743489584, "learning_rate": 0.0001, "loss": 9.3248, "loss/crossentropy": 2.369896650314331, "loss/hidden": 3.703125, "loss/jsd": 0.0, "loss/logits": 0.3417474627494812, "step": 1106 }, { "epoch": 0.06925, "grad_norm": 4.21875, "grad_norm_var": 0.32355143229166666, "learning_rate": 0.0001, "loss": 9.422, "loss/crossentropy": 2.6357239484786987, "loss/hidden": 3.640625, "loss/jsd": 0.0, "loss/logits": 0.33385856449604034, "step": 1108 }, { "epoch": 0.069375, "grad_norm": 4.1875, "grad_norm_var": 0.33131103515625, "learning_rate": 0.0001, "loss": 9.3892, "loss/crossentropy": 2.6354408264160156, "loss/hidden": 3.609375, "loss/jsd": 0.0, "loss/logits": 0.33535902202129364, "step": 1110 }, { "epoch": 0.0695, "grad_norm": 3.859375, "grad_norm_var": 0.3421712239583333, "learning_rate": 0.0001, "loss": 9.3288, "loss/crossentropy": 2.2603734731674194, "loss/hidden": 3.53125, "loss/jsd": 0.0, "loss/logits": 0.31547120213508606, "step": 1112 }, { "epoch": 0.069625, "grad_norm": 3.625, "grad_norm_var": 0.3455556233723958, "learning_rate": 0.0001, "loss": 9.0927, "loss/crossentropy": 2.2856796979904175, "loss/hidden": 3.6796875, "loss/jsd": 0.0, "loss/logits": 0.31341737508773804, "step": 1114 }, { "epoch": 0.06975, "grad_norm": 3.765625, "grad_norm_var": 0.3824208577473958, "learning_rate": 0.0001, "loss": 9.065, "loss/crossentropy": 2.3744817972183228, "loss/hidden": 3.53125, "loss/jsd": 0.0, "loss/logits": 0.3295013904571533, "step": 1116 }, { "epoch": 0.069875, "grad_norm": 3.953125, "grad_norm_var": 0.33152567545572914, "learning_rate": 0.0001, "loss": 9.286, "loss/crossentropy": 2.4832775592803955, "loss/hidden": 3.5859375, "loss/jsd": 0.0, "loss/logits": 0.3211631774902344, "step": 1118 }, { "epoch": 0.07, "grad_norm": 3.640625, "grad_norm_var": 0.2992177327473958, "learning_rate": 0.0001, "loss": 9.2446, "loss/crossentropy": 2.6656835079193115, "loss/hidden": 3.609375, "loss/jsd": 0.0, "loss/logits": 0.33588290214538574, "step": 1120 }, { "epoch": 0.070125, "grad_norm": 4.125, "grad_norm_var": 0.09185282389322917, "learning_rate": 0.0001, "loss": 9.1868, "loss/crossentropy": 2.418588876724243, "loss/hidden": 3.6484375, "loss/jsd": 0.0, "loss/logits": 0.36348558962345123, "step": 1122 }, { "epoch": 0.07025, "grad_norm": 4.09375, "grad_norm_var": 0.08717041015625, "learning_rate": 0.0001, "loss": 9.1917, "loss/crossentropy": 2.309618353843689, "loss/hidden": 3.5546875, "loss/jsd": 0.0, "loss/logits": 0.2983853369951248, "step": 1124 }, { "epoch": 0.070375, "grad_norm": 4.03125, "grad_norm_var": 0.0684478759765625, "learning_rate": 0.0001, "loss": 9.0934, "loss/crossentropy": 2.466736316680908, "loss/hidden": 3.53125, "loss/jsd": 0.0, "loss/logits": 0.31100137531757355, "step": 1126 }, { "epoch": 0.0705, "grad_norm": 3.984375, "grad_norm_var": 0.10739644368489583, "learning_rate": 0.0001, "loss": 9.3913, "loss/crossentropy": 2.4813402891159058, "loss/hidden": 3.515625, "loss/jsd": 0.0, "loss/logits": 0.32681819796562195, "step": 1128 }, { "epoch": 0.070625, "grad_norm": 3.53125, "grad_norm_var": 0.1125152587890625, "learning_rate": 0.0001, "loss": 9.2876, "loss/crossentropy": 2.6551177501678467, "loss/hidden": 3.6640625, "loss/jsd": 0.0, "loss/logits": 0.30686071515083313, "step": 1130 }, { "epoch": 0.07075, "grad_norm": 3.859375, "grad_norm_var": 0.10204671223958334, "learning_rate": 0.0001, "loss": 9.164, "loss/crossentropy": 2.266343593597412, "loss/hidden": 3.484375, "loss/jsd": 0.0, "loss/logits": 0.29779092967510223, "step": 1132 }, { "epoch": 0.070875, "grad_norm": 4.3125, "grad_norm_var": 0.11245829264322917, "learning_rate": 0.0001, "loss": 9.2955, "loss/crossentropy": 2.3365002870559692, "loss/hidden": 3.6171875, "loss/jsd": 0.0, "loss/logits": 0.3126705288887024, "step": 1134 }, { "epoch": 0.071, "grad_norm": 3.90625, "grad_norm_var": 0.1045318603515625, "learning_rate": 0.0001, "loss": 9.0178, "loss/crossentropy": 2.469061851501465, "loss/hidden": 3.609375, "loss/jsd": 0.0, "loss/logits": 0.32917141914367676, "step": 1136 }, { "epoch": 0.071125, "grad_norm": 3.5, "grad_norm_var": 0.12392171223958333, "learning_rate": 0.0001, "loss": 8.9858, "loss/crossentropy": 2.5383065938949585, "loss/hidden": 3.5859375, "loss/jsd": 0.0, "loss/logits": 0.31508152186870575, "step": 1138 }, { "epoch": 0.07125, "grad_norm": 3.703125, "grad_norm_var": 0.1242828369140625, "learning_rate": 0.0001, "loss": 9.3816, "loss/crossentropy": 2.7282421588897705, "loss/hidden": 3.5859375, "loss/jsd": 0.0, "loss/logits": 0.329195499420166, "step": 1140 }, { "epoch": 0.071375, "grad_norm": 4.125, "grad_norm_var": 0.1225982666015625, "learning_rate": 0.0001, "loss": 9.5541, "loss/crossentropy": 2.4712259769439697, "loss/hidden": 3.5703125, "loss/jsd": 0.0, "loss/logits": 0.3016209304332733, "step": 1142 }, { "epoch": 0.0715, "grad_norm": 3.9375, "grad_norm_var": 0.057291666666666664, "learning_rate": 0.0001, "loss": 9.1036, "loss/crossentropy": 2.4540599584579468, "loss/hidden": 3.5625, "loss/jsd": 0.0, "loss/logits": 0.31999677419662476, "step": 1144 }, { "epoch": 0.071625, "grad_norm": 4.21875, "grad_norm_var": 0.053465779622395834, "learning_rate": 0.0001, "loss": 9.1462, "loss/crossentropy": 2.808298349380493, "loss/hidden": 3.6328125, "loss/jsd": 0.0, "loss/logits": 0.3199266195297241, "step": 1146 }, { "epoch": 0.07175, "grad_norm": 3.796875, "grad_norm_var": 0.06948954264322917, "learning_rate": 0.0001, "loss": 9.3125, "loss/crossentropy": 2.5488197803497314, "loss/hidden": 3.59375, "loss/jsd": 0.0, "loss/logits": 0.34709227085113525, "step": 1148 }, { "epoch": 0.071875, "grad_norm": 3.328125, "grad_norm_var": 0.08771158854166666, "learning_rate": 0.0001, "loss": 8.9519, "loss/crossentropy": 2.3145110607147217, "loss/hidden": 3.5, "loss/jsd": 0.0, "loss/logits": 0.3027106672525406, "step": 1150 }, { "epoch": 0.072, "grad_norm": 5.375, "grad_norm_var": 0.23205973307291666, "learning_rate": 0.0001, "loss": 9.251, "loss/crossentropy": 2.2240471839904785, "loss/hidden": 3.640625, "loss/jsd": 0.0, "loss/logits": 0.336179718375206, "step": 1152 }, { "epoch": 0.072125, "grad_norm": 3.90625, "grad_norm_var": 0.21687723795572916, "learning_rate": 0.0001, "loss": 9.1314, "loss/crossentropy": 2.105097532272339, "loss/hidden": 3.6328125, "loss/jsd": 0.0, "loss/logits": 0.3255419135093689, "step": 1154 }, { "epoch": 0.07225, "grad_norm": 4.03125, "grad_norm_var": 0.21018778483072917, "learning_rate": 0.0001, "loss": 9.0524, "loss/crossentropy": 2.4139981269836426, "loss/hidden": 3.6328125, "loss/jsd": 0.0, "loss/logits": 0.3454781472682953, "step": 1156 }, { "epoch": 0.072375, "grad_norm": 3.78125, "grad_norm_var": 0.21177469889322917, "learning_rate": 0.0001, "loss": 9.1533, "loss/crossentropy": 2.3245939016342163, "loss/hidden": 3.46875, "loss/jsd": 0.0, "loss/logits": 0.28686030209064484, "step": 1158 }, { "epoch": 0.0725, "grad_norm": 3.765625, "grad_norm_var": 0.22030843098958333, "learning_rate": 0.0001, "loss": 9.2865, "loss/crossentropy": 2.5649216175079346, "loss/hidden": 3.625, "loss/jsd": 0.0, "loss/logits": 0.3485357314348221, "step": 1160 }, { "epoch": 0.072625, "grad_norm": 3.859375, "grad_norm_var": 0.21592508951822917, "learning_rate": 0.0001, "loss": 9.4127, "loss/crossentropy": 2.467991352081299, "loss/hidden": 3.6015625, "loss/jsd": 0.0, "loss/logits": 0.3160312622785568, "step": 1162 }, { "epoch": 0.07275, "grad_norm": 3.796875, "grad_norm_var": 0.20200907389322917, "learning_rate": 0.0001, "loss": 9.208, "loss/crossentropy": 2.4666264057159424, "loss/hidden": 3.5390625, "loss/jsd": 0.0, "loss/logits": 0.32405731081962585, "step": 1164 }, { "epoch": 0.072875, "grad_norm": 3.453125, "grad_norm_var": 0.17566630045572917, "learning_rate": 0.0001, "loss": 8.9344, "loss/crossentropy": 2.497612714767456, "loss/hidden": 3.5703125, "loss/jsd": 0.0, "loss/logits": 0.2962990552186966, "step": 1166 }, { "epoch": 0.073, "grad_norm": 4.21875, "grad_norm_var": 0.07573140462239583, "learning_rate": 0.0001, "loss": 9.2981, "loss/crossentropy": 2.3695040941238403, "loss/hidden": 3.625, "loss/jsd": 0.0, "loss/logits": 0.321966215968132, "step": 1168 }, { "epoch": 0.073125, "grad_norm": 3.984375, "grad_norm_var": 0.07278645833333333, "learning_rate": 0.0001, "loss": 9.348, "loss/crossentropy": 2.399674415588379, "loss/hidden": 3.5703125, "loss/jsd": 0.0, "loss/logits": 0.3187423199415207, "step": 1170 }, { "epoch": 0.07325, "grad_norm": 4.09375, "grad_norm_var": 0.07532552083333334, "learning_rate": 0.0001, "loss": 9.2577, "loss/crossentropy": 2.395334005355835, "loss/hidden": 3.6484375, "loss/jsd": 0.0, "loss/logits": 0.3417189121246338, "step": 1172 }, { "epoch": 0.073375, "grad_norm": 3.5, "grad_norm_var": 0.08810933430989583, "learning_rate": 0.0001, "loss": 8.7726, "loss/crossentropy": 2.0574229955673218, "loss/hidden": 3.46875, "loss/jsd": 0.0, "loss/logits": 0.30047091841697693, "step": 1174 }, { "epoch": 0.0735, "grad_norm": 3.59375, "grad_norm_var": 0.08876953125, "learning_rate": 0.0001, "loss": 9.1649, "loss/crossentropy": 2.3426761627197266, "loss/hidden": 3.578125, "loss/jsd": 0.0, "loss/logits": 0.31436459720134735, "step": 1176 }, { "epoch": 0.073625, "grad_norm": 4.03125, "grad_norm_var": 0.0902984619140625, "learning_rate": 0.0001, "loss": 9.3917, "loss/crossentropy": 2.4197838306427, "loss/hidden": 3.5390625, "loss/jsd": 0.0, "loss/logits": 0.30905722081661224, "step": 1178 }, { "epoch": 0.07375, "grad_norm": 4.6875, "grad_norm_var": 0.12220052083333334, "learning_rate": 0.0001, "loss": 9.1579, "loss/crossentropy": 2.3440288305282593, "loss/hidden": 3.5234375, "loss/jsd": 0.0, "loss/logits": 0.3073788434267044, "step": 1180 }, { "epoch": 0.073875, "grad_norm": 4.0625, "grad_norm_var": 0.10395406087239584, "learning_rate": 0.0001, "loss": 9.3372, "loss/crossentropy": 2.3033924102783203, "loss/hidden": 3.5390625, "loss/jsd": 0.0, "loss/logits": 0.34168997406959534, "step": 1182 }, { "epoch": 0.074, "grad_norm": 4.25, "grad_norm_var": 0.0841705322265625, "learning_rate": 0.0001, "loss": 9.4208, "loss/crossentropy": 2.6207507848739624, "loss/hidden": 3.65625, "loss/jsd": 0.0, "loss/logits": 0.34534966945648193, "step": 1184 }, { "epoch": 0.074125, "grad_norm": 3.78125, "grad_norm_var": 0.08531494140625, "learning_rate": 0.0001, "loss": 9.286, "loss/crossentropy": 2.4476726055145264, "loss/hidden": 3.5, "loss/jsd": 0.0, "loss/logits": 0.3093183934688568, "step": 1186 }, { "epoch": 0.07425, "grad_norm": 5.28125, "grad_norm_var": 0.19589742024739584, "learning_rate": 0.0001, "loss": 9.0637, "loss/crossentropy": 2.2181872725486755, "loss/hidden": 3.640625, "loss/jsd": 0.0, "loss/logits": 0.32876770198345184, "step": 1188 }, { "epoch": 0.074375, "grad_norm": 8.75, "grad_norm_var": 1.5139719645182292, "learning_rate": 0.0001, "loss": 9.1797, "loss/crossentropy": 2.1833176612854004, "loss/hidden": 3.8828125, "loss/jsd": 0.0, "loss/logits": 0.36219629645347595, "step": 1190 }, { "epoch": 0.0745, "grad_norm": 3.578125, "grad_norm_var": 1.5350494384765625, "learning_rate": 0.0001, "loss": 9.0629, "loss/crossentropy": 2.405817151069641, "loss/hidden": 3.4921875, "loss/jsd": 0.0, "loss/logits": 0.31212493777275085, "step": 1192 }, { "epoch": 0.074625, "grad_norm": 4.28125, "grad_norm_var": 1.5252675374348958, "learning_rate": 0.0001, "loss": 9.0899, "loss/crossentropy": 2.432392120361328, "loss/hidden": 3.609375, "loss/jsd": 0.0, "loss/logits": 0.323747381567955, "step": 1194 }, { "epoch": 0.07475, "grad_norm": 4.1875, "grad_norm_var": 1.516307576497396, "learning_rate": 0.0001, "loss": 9.4802, "loss/crossentropy": 2.60243022441864, "loss/hidden": 3.609375, "loss/jsd": 0.0, "loss/logits": 0.3159172534942627, "step": 1196 }, { "epoch": 0.074875, "grad_norm": 4.0625, "grad_norm_var": 1.518024698893229, "learning_rate": 0.0001, "loss": 9.1813, "loss/crossentropy": 2.2708317041397095, "loss/hidden": 3.515625, "loss/jsd": 0.0, "loss/logits": 0.3217354714870453, "step": 1198 }, { "epoch": 0.075, "grad_norm": 3.53125, "grad_norm_var": 1.5860260009765625, "learning_rate": 0.0001, "loss": 8.8086, "loss/crossentropy": 2.2433842420578003, "loss/hidden": 3.53125, "loss/jsd": 0.0, "loss/logits": 0.2955727279186249, "step": 1200 }, { "epoch": 0.075125, "grad_norm": 4.0625, "grad_norm_var": 1.5884429931640625, "learning_rate": 0.0001, "loss": 8.9904, "loss/crossentropy": 2.544836163520813, "loss/hidden": 3.546875, "loss/jsd": 0.0, "loss/logits": 0.3101474642753601, "step": 1202 }, { "epoch": 0.07525, "grad_norm": 3.984375, "grad_norm_var": 1.5440388997395833, "learning_rate": 0.0001, "loss": 9.4, "loss/crossentropy": 2.546027898788452, "loss/hidden": 3.671875, "loss/jsd": 0.0, "loss/logits": 0.3451322615146637, "step": 1204 }, { "epoch": 0.075375, "grad_norm": 3.78125, "grad_norm_var": 0.1521484375, "learning_rate": 0.0001, "loss": 9.0253, "loss/crossentropy": 2.4012279510498047, "loss/hidden": 3.4921875, "loss/jsd": 0.0, "loss/logits": 0.290866881608963, "step": 1206 }, { "epoch": 0.0755, "grad_norm": 5.15625, "grad_norm_var": 18.180106608072915, "learning_rate": 0.0001, "loss": 10.0656, "loss/crossentropy": 2.4804184436798096, "loss/hidden": 3.6171875, "loss/jsd": 0.0, "loss/logits": 0.31466802954673767, "step": 1208 }, { "epoch": 0.075625, "grad_norm": 3.421875, "grad_norm_var": 18.290657552083335, "learning_rate": 0.0001, "loss": 8.942, "loss/crossentropy": 2.104141592979431, "loss/hidden": 3.4296875, "loss/jsd": 0.0, "loss/logits": 0.3016415685415268, "step": 1210 }, { "epoch": 0.07575, "grad_norm": 3.53125, "grad_norm_var": 18.478189086914064, "learning_rate": 0.0001, "loss": 8.7778, "loss/crossentropy": 2.216665744781494, "loss/hidden": 3.4921875, "loss/jsd": 0.0, "loss/logits": 0.30427753925323486, "step": 1212 }, { "epoch": 0.075875, "grad_norm": 3.59375, "grad_norm_var": 18.572997029622396, "learning_rate": 0.0001, "loss": 8.9408, "loss/crossentropy": 2.410394072532654, "loss/hidden": 3.59375, "loss/jsd": 0.0, "loss/logits": 0.3141447752714157, "step": 1214 }, { "epoch": 0.076, "grad_norm": 3.71875, "grad_norm_var": 18.539623006184897, "learning_rate": 0.0001, "loss": 9.0702, "loss/crossentropy": 2.2947787642478943, "loss/hidden": 3.546875, "loss/jsd": 0.0, "loss/logits": 0.3040497452020645, "step": 1216 }, { "epoch": 0.076125, "grad_norm": 3.96875, "grad_norm_var": 18.52271728515625, "learning_rate": 0.0001, "loss": 9.5222, "loss/crossentropy": 2.658992886543274, "loss/hidden": 3.6484375, "loss/jsd": 0.0, "loss/logits": 0.33450958132743835, "step": 1218 }, { "epoch": 0.07625, "grad_norm": 3.59375, "grad_norm_var": 18.640262858072916, "learning_rate": 0.0001, "loss": 9.1006, "loss/crossentropy": 2.382628321647644, "loss/hidden": 3.5546875, "loss/jsd": 0.0, "loss/logits": 0.3292655050754547, "step": 1220 }, { "epoch": 0.076375, "grad_norm": 3.859375, "grad_norm_var": 18.59713134765625, "learning_rate": 0.0001, "loss": 8.8435, "loss/crossentropy": 1.980876863002777, "loss/hidden": 3.453125, "loss/jsd": 0.0, "loss/logits": 0.2758803069591522, "step": 1222 }, { "epoch": 0.0765, "grad_norm": 3.640625, "grad_norm_var": 0.031038411458333335, "learning_rate": 0.0001, "loss": 8.8474, "loss/crossentropy": 2.0616570711135864, "loss/hidden": 3.5390625, "loss/jsd": 0.0, "loss/logits": 0.28200867772102356, "step": 1224 }, { "epoch": 0.076625, "grad_norm": 3.734375, "grad_norm_var": 0.021564737955729166, "learning_rate": 0.0001, "loss": 9.1792, "loss/crossentropy": 2.3715614080429077, "loss/hidden": 3.5234375, "loss/jsd": 0.0, "loss/logits": 0.31676220893859863, "step": 1226 }, { "epoch": 0.07675, "grad_norm": 3.84375, "grad_norm_var": 0.021971638997395834, "learning_rate": 0.0001, "loss": 9.0379, "loss/crossentropy": 2.5357784032821655, "loss/hidden": 3.6171875, "loss/jsd": 0.0, "loss/logits": 0.31404489278793335, "step": 1228 }, { "epoch": 0.076875, "grad_norm": 3.59375, "grad_norm_var": 0.022419230143229166, "learning_rate": 0.0001, "loss": 8.6486, "loss/crossentropy": 2.225548505783081, "loss/hidden": 3.4140625, "loss/jsd": 0.0, "loss/logits": 0.286647230386734, "step": 1230 }, { "epoch": 0.077, "grad_norm": 3.578125, "grad_norm_var": 0.025797526041666668, "learning_rate": 0.0001, "loss": 8.9957, "loss/crossentropy": 2.3057247400283813, "loss/hidden": 3.5625, "loss/jsd": 0.0, "loss/logits": 0.3172541856765747, "step": 1232 }, { "epoch": 0.077125, "grad_norm": 3.578125, "grad_norm_var": 0.023388671875, "learning_rate": 0.0001, "loss": 9.0791, "loss/crossentropy": 2.290403127670288, "loss/hidden": 3.4921875, "loss/jsd": 0.0, "loss/logits": 0.31419865787029266, "step": 1234 }, { "epoch": 0.07725, "grad_norm": 3.46875, "grad_norm_var": 0.021305338541666666, "learning_rate": 0.0001, "loss": 8.9479, "loss/crossentropy": 2.1275144815444946, "loss/hidden": 3.5703125, "loss/jsd": 0.0, "loss/logits": 0.31394386291503906, "step": 1236 }, { "epoch": 0.077375, "grad_norm": 4.0625, "grad_norm_var": 0.05321858723958333, "learning_rate": 0.0001, "loss": 8.9015, "loss/crossentropy": 2.2454686164855957, "loss/hidden": 3.5, "loss/jsd": 0.0, "loss/logits": 0.2934340536594391, "step": 1238 }, { "epoch": 0.0775, "grad_norm": 3.703125, "grad_norm_var": 0.0609527587890625, "learning_rate": 0.0001, "loss": 8.856, "loss/crossentropy": 2.2809821367263794, "loss/hidden": 3.4921875, "loss/jsd": 0.0, "loss/logits": 0.29852208495140076, "step": 1240 }, { "epoch": 0.077625, "grad_norm": 3.640625, "grad_norm_var": 0.062841796875, "learning_rate": 0.0001, "loss": 9.0546, "loss/crossentropy": 2.2770636081695557, "loss/hidden": 3.5, "loss/jsd": 0.0, "loss/logits": 0.31510084867477417, "step": 1242 }, { "epoch": 0.07775, "grad_norm": 3.671875, "grad_norm_var": 0.06161702473958333, "learning_rate": 0.0001, "loss": 9.0968, "loss/crossentropy": 2.2599165439605713, "loss/hidden": 3.5625, "loss/jsd": 0.0, "loss/logits": 0.3091094493865967, "step": 1244 }, { "epoch": 0.077875, "grad_norm": 3.703125, "grad_norm_var": 0.05953369140625, "learning_rate": 0.0001, "loss": 8.9473, "loss/crossentropy": 2.0840908885002136, "loss/hidden": 3.4765625, "loss/jsd": 0.0, "loss/logits": 0.29790809750556946, "step": 1246 }, { "epoch": 0.078, "grad_norm": 3.703125, "grad_norm_var": 0.10155843098958334, "learning_rate": 0.0001, "loss": 9.3225, "loss/crossentropy": 2.345468759536743, "loss/hidden": 3.5859375, "loss/jsd": 0.0, "loss/logits": 0.34454044699668884, "step": 1248 }, { "epoch": 0.078125, "grad_norm": 3.515625, "grad_norm_var": 0.10816141764322916, "learning_rate": 0.0001, "loss": 9.1545, "loss/crossentropy": 2.413212776184082, "loss/hidden": 3.5, "loss/jsd": 0.0, "loss/logits": 0.31984058022499084, "step": 1250 }, { "epoch": 0.07825, "grad_norm": 3.65625, "grad_norm_var": 0.110205078125, "learning_rate": 0.0001, "loss": 8.9687, "loss/crossentropy": 2.4369957447052, "loss/hidden": 3.5390625, "loss/jsd": 0.0, "loss/logits": 0.3076692074537277, "step": 1252 }, { "epoch": 0.078375, "grad_norm": 4.59375, "grad_norm_var": 0.12951558430989582, "learning_rate": 0.0001, "loss": 8.9991, "loss/crossentropy": 2.415233612060547, "loss/hidden": 3.5859375, "loss/jsd": 0.0, "loss/logits": 0.29537807404994965, "step": 1254 }, { "epoch": 0.0785, "grad_norm": 7.8125, "grad_norm_var": 1.1356770833333334, "learning_rate": 0.0001, "loss": 9.5869, "loss/crossentropy": 2.3058598041534424, "loss/hidden": 3.578125, "loss/jsd": 0.0, "loss/logits": 0.3160950839519501, "step": 1256 }, { "epoch": 0.078625, "grad_norm": 3.484375, "grad_norm_var": 1.149267578125, "learning_rate": 0.0001, "loss": 9.2005, "loss/crossentropy": 2.412594437599182, "loss/hidden": 3.625, "loss/jsd": 0.0, "loss/logits": 0.31167350709438324, "step": 1258 }, { "epoch": 0.07875, "grad_norm": 3.34375, "grad_norm_var": 1.1874989827473958, "learning_rate": 0.0001, "loss": 8.9355, "loss/crossentropy": 2.365793824195862, "loss/hidden": 3.3984375, "loss/jsd": 0.0, "loss/logits": 0.30335795879364014, "step": 1260 }, { "epoch": 0.078875, "grad_norm": 6.6875, "grad_norm_var": 1.6226236979166666, "learning_rate": 0.0001, "loss": 9.1613, "loss/crossentropy": 2.261883854866028, "loss/hidden": 3.4921875, "loss/jsd": 0.0, "loss/logits": 0.3325384855270386, "step": 1262 }, { "epoch": 0.079, "grad_norm": 3.828125, "grad_norm_var": 1.6119099934895833, "learning_rate": 0.0001, "loss": 8.9772, "loss/crossentropy": 2.37065851688385, "loss/hidden": 3.5, "loss/jsd": 0.0, "loss/logits": 0.3135601878166199, "step": 1264 }, { "epoch": 0.079125, "grad_norm": 5.0625, "grad_norm_var": 1.6016103108723958, "learning_rate": 0.0001, "loss": 8.8108, "loss/crossentropy": 2.330946683883667, "loss/hidden": 3.484375, "loss/jsd": 0.0, "loss/logits": 0.28366419672966003, "step": 1266 }, { "epoch": 0.07925, "grad_norm": 4.125, "grad_norm_var": 1.5486399332682292, "learning_rate": 0.0001, "loss": 9.0762, "loss/crossentropy": 2.5277793407440186, "loss/hidden": 3.6171875, "loss/jsd": 0.0, "loss/logits": 0.317441463470459, "step": 1268 }, { "epoch": 0.079375, "grad_norm": 4.09375, "grad_norm_var": 1.5559234619140625, "learning_rate": 0.0001, "loss": 8.8627, "loss/crossentropy": 2.0369693636894226, "loss/hidden": 3.53125, "loss/jsd": 0.0, "loss/logits": 0.2870900481939316, "step": 1270 }, { "epoch": 0.0795, "grad_norm": 3.671875, "grad_norm_var": 0.6730794270833333, "learning_rate": 0.0001, "loss": 9.2479, "loss/crossentropy": 2.266517758369446, "loss/hidden": 3.5078125, "loss/jsd": 0.0, "loss/logits": 0.3010869026184082, "step": 1272 }, { "epoch": 0.079625, "grad_norm": 4.84375, "grad_norm_var": 0.8135162353515625, "learning_rate": 0.0001, "loss": 9.2829, "loss/crossentropy": 2.4318493604660034, "loss/hidden": 3.578125, "loss/jsd": 0.0, "loss/logits": 0.3534676879644394, "step": 1274 }, { "epoch": 0.07975, "grad_norm": 3.75, "grad_norm_var": 0.7549112955729167, "learning_rate": 0.0001, "loss": 9.2051, "loss/crossentropy": 2.665824294090271, "loss/hidden": 3.609375, "loss/jsd": 0.0, "loss/logits": 0.3084219694137573, "step": 1276 }, { "epoch": 0.079875, "grad_norm": 4.09375, "grad_norm_var": 0.3110015869140625, "learning_rate": 0.0001, "loss": 9.3341, "loss/crossentropy": 2.585180401802063, "loss/hidden": 3.65625, "loss/jsd": 0.0, "loss/logits": 0.3485944867134094, "step": 1278 }, { "epoch": 0.08, "grad_norm": 3.890625, "grad_norm_var": 0.30613606770833335, "learning_rate": 0.0001, "loss": 9.0663, "loss/crossentropy": 2.235915422439575, "loss/hidden": 3.5234375, "loss/jsd": 0.0, "loss/logits": 0.3006708025932312, "step": 1280 }, { "epoch": 0.080125, "grad_norm": 3.5625, "grad_norm_var": 0.28245442708333335, "learning_rate": 0.0001, "loss": 8.9244, "loss/crossentropy": 2.196950912475586, "loss/hidden": 3.53125, "loss/jsd": 0.0, "loss/logits": 0.30859434604644775, "step": 1282 }, { "epoch": 0.08025, "grad_norm": 3.84375, "grad_norm_var": 0.27616780598958335, "learning_rate": 0.0001, "loss": 8.9921, "loss/crossentropy": 2.3487383127212524, "loss/hidden": 3.578125, "loss/jsd": 0.0, "loss/logits": 0.31605543196201324, "step": 1284 }, { "epoch": 0.080375, "grad_norm": 3.625, "grad_norm_var": 0.27758687337239585, "learning_rate": 0.0001, "loss": 9.0989, "loss/crossentropy": 2.2177536487579346, "loss/hidden": 3.453125, "loss/jsd": 0.0, "loss/logits": 0.2946600914001465, "step": 1286 }, { "epoch": 0.0805, "grad_norm": 3.515625, "grad_norm_var": 0.28926493326822916, "learning_rate": 0.0001, "loss": 8.8901, "loss/crossentropy": 2.1754029989242554, "loss/hidden": 3.5546875, "loss/jsd": 0.0, "loss/logits": 0.3095686435699463, "step": 1288 }, { "epoch": 0.080625, "grad_norm": 3.984375, "grad_norm_var": 0.04488932291666667, "learning_rate": 0.0001, "loss": 9.0607, "loss/crossentropy": 2.392609477043152, "loss/hidden": 3.5234375, "loss/jsd": 0.0, "loss/logits": 0.2938263714313507, "step": 1290 }, { "epoch": 0.08075, "grad_norm": 4.1875, "grad_norm_var": 0.3003326416015625, "learning_rate": 0.0001, "loss": 9.5169, "loss/crossentropy": 2.4202769994735718, "loss/hidden": 3.609375, "loss/jsd": 0.0, "loss/logits": 0.3104136437177658, "step": 1292 }, { "epoch": 0.080875, "grad_norm": 3.640625, "grad_norm_var": 0.29915262858072916, "learning_rate": 0.0001, "loss": 8.8761, "loss/crossentropy": 2.2731767892837524, "loss/hidden": 3.4375, "loss/jsd": 0.0, "loss/logits": 0.30449719727039337, "step": 1294 }, { "epoch": 0.081, "grad_norm": 3.59375, "grad_norm_var": 0.30543619791666665, "learning_rate": 0.0001, "loss": 9.0399, "loss/crossentropy": 2.470086932182312, "loss/hidden": 3.484375, "loss/jsd": 0.0, "loss/logits": 0.28193579614162445, "step": 1296 }, { "epoch": 0.081125, "grad_norm": 3.40625, "grad_norm_var": 0.2979644775390625, "learning_rate": 0.0001, "loss": 9.0157, "loss/crossentropy": 2.167941153049469, "loss/hidden": 3.421875, "loss/jsd": 0.0, "loss/logits": 0.29203882813453674, "step": 1298 }, { "epoch": 0.08125, "grad_norm": 3.734375, "grad_norm_var": 0.2990193684895833, "learning_rate": 0.0001, "loss": 9.1285, "loss/crossentropy": 2.2209893465042114, "loss/hidden": 3.640625, "loss/jsd": 0.0, "loss/logits": 0.309230774641037, "step": 1300 }, { "epoch": 0.081375, "grad_norm": 3.671875, "grad_norm_var": 0.2967274983723958, "learning_rate": 0.0001, "loss": 9.0471, "loss/crossentropy": 2.2622058391571045, "loss/hidden": 3.6640625, "loss/jsd": 0.0, "loss/logits": 0.32858118414878845, "step": 1302 }, { "epoch": 0.0815, "grad_norm": 3.984375, "grad_norm_var": 0.2907786051432292, "learning_rate": 0.0001, "loss": 9.001, "loss/crossentropy": 2.550824522972107, "loss/hidden": 3.4921875, "loss/jsd": 0.0, "loss/logits": 0.30813509225845337, "step": 1304 }, { "epoch": 0.081625, "grad_norm": 3.9375, "grad_norm_var": 0.28088785807291666, "learning_rate": 0.0001, "loss": 9.1413, "loss/crossentropy": 2.234109878540039, "loss/hidden": 3.65625, "loss/jsd": 0.0, "loss/logits": 0.3101739436388016, "step": 1306 }, { "epoch": 0.08175, "grad_norm": 3.625, "grad_norm_var": 0.0467681884765625, "learning_rate": 0.0001, "loss": 9.1138, "loss/crossentropy": 2.6315842866897583, "loss/hidden": 3.6484375, "loss/jsd": 0.0, "loss/logits": 0.3196646571159363, "step": 1308 }, { "epoch": 0.081875, "grad_norm": 3.453125, "grad_norm_var": 0.052586873372395836, "learning_rate": 0.0001, "loss": 8.8418, "loss/crossentropy": 2.262266516685486, "loss/hidden": 3.4765625, "loss/jsd": 0.0, "loss/logits": 0.29676589369773865, "step": 1310 }, { "epoch": 0.082, "grad_norm": 3.703125, "grad_norm_var": 0.049702962239583336, "learning_rate": 0.0001, "loss": 9.0204, "loss/crossentropy": 2.2959643602371216, "loss/hidden": 3.46875, "loss/jsd": 0.0, "loss/logits": 0.3009609580039978, "step": 1312 }, { "epoch": 0.082125, "grad_norm": 3.46875, "grad_norm_var": 0.03916727701822917, "learning_rate": 0.0001, "loss": 9.0814, "loss/crossentropy": 2.8231089115142822, "loss/hidden": 3.6328125, "loss/jsd": 0.0, "loss/logits": 0.320039302110672, "step": 1314 }, { "epoch": 0.08225, "grad_norm": 3.515625, "grad_norm_var": 0.043843587239583336, "learning_rate": 0.0001, "loss": 9.0418, "loss/crossentropy": 2.6723110675811768, "loss/hidden": 3.515625, "loss/jsd": 0.0, "loss/logits": 0.3218715190887451, "step": 1316 }, { "epoch": 0.082375, "grad_norm": 3.921875, "grad_norm_var": 0.04127197265625, "learning_rate": 0.0001, "loss": 9.1254, "loss/crossentropy": 2.817944288253784, "loss/hidden": 3.53125, "loss/jsd": 0.0, "loss/logits": 0.3088568150997162, "step": 1318 }, { "epoch": 0.0825, "grad_norm": 3.25, "grad_norm_var": 0.0599029541015625, "learning_rate": 0.0001, "loss": 8.9, "loss/crossentropy": 2.3980846405029297, "loss/hidden": 3.609375, "loss/jsd": 0.0, "loss/logits": 0.2999623566865921, "step": 1320 }, { "epoch": 0.082625, "grad_norm": 3.84375, "grad_norm_var": 0.0474273681640625, "learning_rate": 0.0001, "loss": 9.3528, "loss/crossentropy": 2.595438838005066, "loss/hidden": 3.5859375, "loss/jsd": 0.0, "loss/logits": 0.3496478945016861, "step": 1322 }, { "epoch": 0.08275, "grad_norm": 3.734375, "grad_norm_var": 0.0486724853515625, "learning_rate": 0.0001, "loss": 8.8593, "loss/crossentropy": 2.2254860401153564, "loss/hidden": 3.546875, "loss/jsd": 0.0, "loss/logits": 0.2829415947198868, "step": 1324 }, { "epoch": 0.082875, "grad_norm": 3.484375, "grad_norm_var": 0.049723307291666664, "learning_rate": 0.0001, "loss": 8.9027, "loss/crossentropy": 2.44465708732605, "loss/hidden": 3.4375, "loss/jsd": 0.0, "loss/logits": 0.32694484293460846, "step": 1326 }, { "epoch": 0.083, "grad_norm": 3.734375, "grad_norm_var": 0.053544108072916666, "learning_rate": 0.0001, "loss": 9.1538, "loss/crossentropy": 2.313889980316162, "loss/hidden": 3.625, "loss/jsd": 0.0, "loss/logits": 0.33095018565654755, "step": 1328 }, { "epoch": 0.083125, "grad_norm": 3.90625, "grad_norm_var": 0.049332682291666666, "learning_rate": 0.0001, "loss": 9.1456, "loss/crossentropy": 2.4496175050735474, "loss/hidden": 3.453125, "loss/jsd": 0.0, "loss/logits": 0.30967962741851807, "step": 1330 }, { "epoch": 0.08325, "grad_norm": 3.5625, "grad_norm_var": 0.04842020670572917, "learning_rate": 0.0001, "loss": 9.0947, "loss/crossentropy": 2.386078953742981, "loss/hidden": 3.515625, "loss/jsd": 0.0, "loss/logits": 0.3234080672264099, "step": 1332 }, { "epoch": 0.083375, "grad_norm": 3.515625, "grad_norm_var": 0.04586181640625, "learning_rate": 0.0001, "loss": 8.7408, "loss/crossentropy": 2.339785575866699, "loss/hidden": 3.609375, "loss/jsd": 0.0, "loss/logits": 0.32101643085479736, "step": 1334 }, { "epoch": 0.0835, "grad_norm": 4.03125, "grad_norm_var": 0.029230753580729168, "learning_rate": 0.0001, "loss": 8.8596, "loss/crossentropy": 2.6105300188064575, "loss/hidden": 3.4765625, "loss/jsd": 0.0, "loss/logits": 0.31350916624069214, "step": 1336 }, { "epoch": 0.083625, "grad_norm": 3.75, "grad_norm_var": 0.028864542643229168, "learning_rate": 0.0001, "loss": 8.63, "loss/crossentropy": 2.0749881863594055, "loss/hidden": 3.390625, "loss/jsd": 0.0, "loss/logits": 0.28848570585250854, "step": 1338 }, { "epoch": 0.08375, "grad_norm": 3.59375, "grad_norm_var": 0.027977498372395833, "learning_rate": 0.0001, "loss": 9.1618, "loss/crossentropy": 2.243058681488037, "loss/hidden": 3.5390625, "loss/jsd": 0.0, "loss/logits": 0.2843780219554901, "step": 1340 }, { "epoch": 0.083875, "grad_norm": 3.3125, "grad_norm_var": 0.042536417643229164, "learning_rate": 0.0001, "loss": 8.6702, "loss/crossentropy": 2.2894665002822876, "loss/hidden": 3.53125, "loss/jsd": 0.0, "loss/logits": 0.3040817379951477, "step": 1342 }, { "epoch": 0.084, "grad_norm": 3.53125, "grad_norm_var": 0.056151326497395834, "learning_rate": 0.0001, "loss": 9.1365, "loss/crossentropy": 2.210882782936096, "loss/hidden": 3.5234375, "loss/jsd": 0.0, "loss/logits": 0.32323381304740906, "step": 1344 }, { "epoch": 0.084125, "grad_norm": 4.125, "grad_norm_var": 0.08358968098958333, "learning_rate": 0.0001, "loss": 9.1776, "loss/crossentropy": 2.342850089073181, "loss/hidden": 3.5234375, "loss/jsd": 0.0, "loss/logits": 0.2916054427623749, "step": 1346 }, { "epoch": 0.08425, "grad_norm": 3.671875, "grad_norm_var": 0.08816731770833333, "learning_rate": 0.0001, "loss": 9.2657, "loss/crossentropy": 2.44538152217865, "loss/hidden": 3.578125, "loss/jsd": 0.0, "loss/logits": 0.3160920739173889, "step": 1348 }, { "epoch": 0.084375, "grad_norm": 3.8125, "grad_norm_var": 0.08382059733072916, "learning_rate": 0.0001, "loss": 9.0063, "loss/crossentropy": 2.5489304065704346, "loss/hidden": 3.46875, "loss/jsd": 0.0, "loss/logits": 0.3081457316875458, "step": 1350 }, { "epoch": 0.0845, "grad_norm": 3.59375, "grad_norm_var": 0.0790191650390625, "learning_rate": 0.0001, "loss": 8.9541, "loss/crossentropy": 2.180498778820038, "loss/hidden": 3.4375, "loss/jsd": 0.0, "loss/logits": 0.3018496334552765, "step": 1352 }, { "epoch": 0.084625, "grad_norm": 3.515625, "grad_norm_var": 0.080224609375, "learning_rate": 0.0001, "loss": 9.082, "loss/crossentropy": 2.384360671043396, "loss/hidden": 3.453125, "loss/jsd": 0.0, "loss/logits": 0.28553615510463715, "step": 1354 }, { "epoch": 0.08475, "grad_norm": 3.6875, "grad_norm_var": 0.0791656494140625, "learning_rate": 0.0001, "loss": 9.0263, "loss/crossentropy": 2.6955255270004272, "loss/hidden": 3.640625, "loss/jsd": 0.0, "loss/logits": 0.3320935517549515, "step": 1356 }, { "epoch": 0.084875, "grad_norm": 3.375, "grad_norm_var": 0.05998942057291667, "learning_rate": 0.0001, "loss": 8.5942, "loss/crossentropy": 2.084562659263611, "loss/hidden": 3.484375, "loss/jsd": 0.0, "loss/logits": 0.2838726341724396, "step": 1358 }, { "epoch": 0.085, "grad_norm": 3.796875, "grad_norm_var": 0.046647135416666666, "learning_rate": 0.0001, "loss": 8.8832, "loss/crossentropy": 2.3465107679367065, "loss/hidden": 3.59375, "loss/jsd": 0.0, "loss/logits": 0.3064923733472824, "step": 1360 }, { "epoch": 0.085125, "grad_norm": 3.828125, "grad_norm_var": 0.025153605143229167, "learning_rate": 0.0001, "loss": 9.089, "loss/crossentropy": 2.3062328100204468, "loss/hidden": 3.4921875, "loss/jsd": 0.0, "loss/logits": 0.3037351071834564, "step": 1362 }, { "epoch": 0.08525, "grad_norm": 3.96875, "grad_norm_var": 0.031148274739583332, "learning_rate": 0.0001, "loss": 8.9159, "loss/crossentropy": 2.109978973865509, "loss/hidden": 3.5234375, "loss/jsd": 0.0, "loss/logits": 0.3013267517089844, "step": 1364 }, { "epoch": 0.085375, "grad_norm": 3.59375, "grad_norm_var": 0.05119527180989583, "learning_rate": 0.0001, "loss": 8.9955, "loss/crossentropy": 2.1400793194770813, "loss/hidden": 3.40625, "loss/jsd": 0.0, "loss/logits": 0.2959713786840439, "step": 1366 }, { "epoch": 0.0855, "grad_norm": 3.609375, "grad_norm_var": 0.051985677083333334, "learning_rate": 0.0001, "loss": 9.0828, "loss/crossentropy": 2.583168387413025, "loss/hidden": 3.5390625, "loss/jsd": 0.0, "loss/logits": 0.3330073952674866, "step": 1368 }, { "epoch": 0.085625, "grad_norm": 3.859375, "grad_norm_var": 0.07195638020833334, "learning_rate": 0.0001, "loss": 8.9546, "loss/crossentropy": 2.3753483295440674, "loss/hidden": 3.5546875, "loss/jsd": 0.0, "loss/logits": 0.2922537922859192, "step": 1370 }, { "epoch": 0.08575, "grad_norm": 3.6875, "grad_norm_var": 0.07174072265625, "learning_rate": 0.0001, "loss": 8.9679, "loss/crossentropy": 2.4578174352645874, "loss/hidden": 3.5625, "loss/jsd": 0.0, "loss/logits": 0.313604399561882, "step": 1372 }, { "epoch": 0.085875, "grad_norm": 4.8125, "grad_norm_var": 0.13062744140625, "learning_rate": 0.0001, "loss": 8.83, "loss/crossentropy": 2.1847041845321655, "loss/hidden": 3.3671875, "loss/jsd": 0.0, "loss/logits": 0.29400117695331573, "step": 1374 }, { "epoch": 0.086, "grad_norm": 3.25, "grad_norm_var": 0.1586822509765625, "learning_rate": 0.0001, "loss": 8.8043, "loss/crossentropy": 2.2943379878997803, "loss/hidden": 3.46875, "loss/jsd": 0.0, "loss/logits": 0.31389278173446655, "step": 1376 }, { "epoch": 0.086125, "grad_norm": 3.65625, "grad_norm_var": 0.16523030598958333, "learning_rate": 0.0001, "loss": 9.0582, "loss/crossentropy": 2.594352960586548, "loss/hidden": 3.6015625, "loss/jsd": 0.0, "loss/logits": 0.32820238173007965, "step": 1378 }, { "epoch": 0.08625, "grad_norm": 3.671875, "grad_norm_var": 0.15104878743489583, "learning_rate": 0.0001, "loss": 9.3228, "loss/crossentropy": 2.571452260017395, "loss/hidden": 3.6796875, "loss/jsd": 0.0, "loss/logits": 0.3398682475090027, "step": 1380 }, { "epoch": 0.086375, "grad_norm": 3.5625, "grad_norm_var": 0.13977457682291666, "learning_rate": 0.0001, "loss": 8.9274, "loss/crossentropy": 2.431585431098938, "loss/hidden": 3.40625, "loss/jsd": 0.0, "loss/logits": 0.2900923192501068, "step": 1382 }, { "epoch": 0.0865, "grad_norm": 3.640625, "grad_norm_var": 0.14218343098958333, "learning_rate": 0.0001, "loss": 8.9046, "loss/crossentropy": 2.4001163244247437, "loss/hidden": 3.515625, "loss/jsd": 0.0, "loss/logits": 0.3200056552886963, "step": 1384 }, { "epoch": 0.086625, "grad_norm": 3.53125, "grad_norm_var": 0.12639058430989583, "learning_rate": 0.0001, "loss": 8.7451, "loss/crossentropy": 2.212107300758362, "loss/hidden": 3.4296875, "loss/jsd": 0.0, "loss/logits": 0.29481518268585205, "step": 1386 }, { "epoch": 0.08675, "grad_norm": 4.53125, "grad_norm_var": 0.17360738118489583, "learning_rate": 0.0001, "loss": 8.9308, "loss/crossentropy": 2.1325159072875977, "loss/hidden": 3.5, "loss/jsd": 0.0, "loss/logits": 0.3161931484937668, "step": 1388 }, { "epoch": 0.086875, "grad_norm": 3.84375, "grad_norm_var": 0.08817952473958333, "learning_rate": 0.0001, "loss": 8.9893, "loss/crossentropy": 2.7963234186172485, "loss/hidden": 3.484375, "loss/jsd": 0.0, "loss/logits": 0.3157733827829361, "step": 1390 }, { "epoch": 0.087, "grad_norm": 3.578125, "grad_norm_var": 0.07779541015625, "learning_rate": 0.0001, "loss": 8.7993, "loss/crossentropy": 2.404562830924988, "loss/hidden": 3.5, "loss/jsd": 0.0, "loss/logits": 0.3078538328409195, "step": 1392 }, { "epoch": 0.087125, "grad_norm": 3.390625, "grad_norm_var": 0.0818359375, "learning_rate": 0.0001, "loss": 8.9106, "loss/crossentropy": 2.411270022392273, "loss/hidden": 3.4765625, "loss/jsd": 0.0, "loss/logits": 0.3172430843114853, "step": 1394 }, { "epoch": 0.08725, "grad_norm": 3.859375, "grad_norm_var": 0.1228515625, "learning_rate": 0.0001, "loss": 9.1999, "loss/crossentropy": 2.290405511856079, "loss/hidden": 3.3125, "loss/jsd": 0.0, "loss/logits": 0.2864595949649811, "step": 1396 }, { "epoch": 0.087375, "grad_norm": 3.5625, "grad_norm_var": 0.12275288899739584, "learning_rate": 0.0001, "loss": 9.0627, "loss/crossentropy": 2.3292382955551147, "loss/hidden": 3.5234375, "loss/jsd": 0.0, "loss/logits": 0.28290052711963654, "step": 1398 }, { "epoch": 0.0875, "grad_norm": 3.578125, "grad_norm_var": 0.12724202473958332, "learning_rate": 0.0001, "loss": 8.6953, "loss/crossentropy": 2.0091291666030884, "loss/hidden": 3.375, "loss/jsd": 0.0, "loss/logits": 0.2793958783149719, "step": 1400 }, { "epoch": 0.087625, "grad_norm": 4.21875, "grad_norm_var": 0.13843994140625, "learning_rate": 0.0001, "loss": 8.9071, "loss/crossentropy": 2.6832462549209595, "loss/hidden": 3.6171875, "loss/jsd": 0.0, "loss/logits": 0.3183281272649765, "step": 1402 }, { "epoch": 0.08775, "grad_norm": 3.3125, "grad_norm_var": 0.10543212890625, "learning_rate": 0.0001, "loss": 8.9968, "loss/crossentropy": 2.267482042312622, "loss/hidden": 3.484375, "loss/jsd": 0.0, "loss/logits": 0.3442094475030899, "step": 1404 }, { "epoch": 0.087875, "grad_norm": 3.46875, "grad_norm_var": 0.10003255208333334, "learning_rate": 0.0001, "loss": 8.8766, "loss/crossentropy": 2.3723970651626587, "loss/hidden": 3.4375, "loss/jsd": 0.0, "loss/logits": 0.293814942240715, "step": 1406 }, { "epoch": 0.088, "grad_norm": 3.75, "grad_norm_var": 0.10246988932291666, "learning_rate": 0.0001, "loss": 8.8107, "loss/crossentropy": 2.3082317113876343, "loss/hidden": 3.5234375, "loss/jsd": 0.0, "loss/logits": 0.3221370577812195, "step": 1408 }, { "epoch": 0.088125, "grad_norm": 3.765625, "grad_norm_var": 0.09614969889322916, "learning_rate": 0.0001, "loss": 8.956, "loss/crossentropy": 2.4758397340774536, "loss/hidden": 3.5234375, "loss/jsd": 0.0, "loss/logits": 0.33573949337005615, "step": 1410 }, { "epoch": 0.08825, "grad_norm": 3.578125, "grad_norm_var": 0.05336812337239583, "learning_rate": 0.0001, "loss": 8.5994, "loss/crossentropy": 2.408522605895996, "loss/hidden": 3.4375, "loss/jsd": 0.0, "loss/logits": 0.29929833114147186, "step": 1412 }, { "epoch": 0.088375, "grad_norm": 3.640625, "grad_norm_var": 0.05576883951822917, "learning_rate": 0.0001, "loss": 9.2281, "loss/crossentropy": 2.563318610191345, "loss/hidden": 3.484375, "loss/jsd": 0.0, "loss/logits": 0.32796354591846466, "step": 1414 }, { "epoch": 0.0885, "grad_norm": 3.4375, "grad_norm_var": 0.05320536295572917, "learning_rate": 0.0001, "loss": 9.0656, "loss/crossentropy": 2.5199949741363525, "loss/hidden": 3.484375, "loss/jsd": 0.0, "loss/logits": 0.29131297767162323, "step": 1416 }, { "epoch": 0.088625, "grad_norm": 3.796875, "grad_norm_var": 0.031525675455729166, "learning_rate": 0.0001, "loss": 8.8611, "loss/crossentropy": 2.536887049674988, "loss/hidden": 3.5703125, "loss/jsd": 0.0, "loss/logits": 0.3120550215244293, "step": 1418 }, { "epoch": 0.08875, "grad_norm": 3.4375, "grad_norm_var": 0.0244293212890625, "learning_rate": 0.0001, "loss": 8.9123, "loss/crossentropy": 2.215959906578064, "loss/hidden": 3.484375, "loss/jsd": 0.0, "loss/logits": 0.2943772077560425, "step": 1420 }, { "epoch": 0.088875, "grad_norm": 3.546875, "grad_norm_var": 0.023173014322916668, "learning_rate": 0.0001, "loss": 8.9674, "loss/crossentropy": 2.1649523973464966, "loss/hidden": 3.5078125, "loss/jsd": 0.0, "loss/logits": 0.3028850704431534, "step": 1422 }, { "epoch": 0.089, "grad_norm": 3.328125, "grad_norm_var": 0.07156575520833333, "learning_rate": 0.0001, "loss": 8.9571, "loss/crossentropy": 2.379367709159851, "loss/hidden": 3.5546875, "loss/jsd": 0.0, "loss/logits": 0.32595328986644745, "step": 1424 }, { "epoch": 0.089125, "grad_norm": 3.796875, "grad_norm_var": 0.07255859375, "learning_rate": 0.0001, "loss": 9.0234, "loss/crossentropy": 2.441414713859558, "loss/hidden": 3.546875, "loss/jsd": 0.0, "loss/logits": 0.3373495191335678, "step": 1426 }, { "epoch": 0.08925, "grad_norm": 3.8125, "grad_norm_var": 0.07247721354166667, "learning_rate": 0.0001, "loss": 9.2807, "loss/crossentropy": 2.720638632774353, "loss/hidden": 3.6015625, "loss/jsd": 0.0, "loss/logits": 0.3340994864702225, "step": 1428 }, { "epoch": 0.089375, "grad_norm": 3.984375, "grad_norm_var": 0.07625325520833333, "learning_rate": 0.0001, "loss": 8.9398, "loss/crossentropy": 2.171198010444641, "loss/hidden": 3.484375, "loss/jsd": 0.0, "loss/logits": 0.3092898577451706, "step": 1430 }, { "epoch": 0.0895, "grad_norm": 3.59375, "grad_norm_var": 0.07291259765625, "learning_rate": 0.0001, "loss": 9.0595, "loss/crossentropy": 2.5796386003494263, "loss/hidden": 3.5078125, "loss/jsd": 0.0, "loss/logits": 0.33887575566768646, "step": 1432 }, { "epoch": 0.089625, "grad_norm": 3.5, "grad_norm_var": 0.07304280598958333, "learning_rate": 0.0001, "loss": 8.7837, "loss/crossentropy": 2.264691114425659, "loss/hidden": 3.34375, "loss/jsd": 0.0, "loss/logits": 0.2661140263080597, "step": 1434 }, { "epoch": 0.08975, "grad_norm": 4.09375, "grad_norm_var": 0.08142903645833334, "learning_rate": 0.0001, "loss": 8.9487, "loss/crossentropy": 2.474991798400879, "loss/hidden": 3.5859375, "loss/jsd": 0.0, "loss/logits": 0.308430016040802, "step": 1436 }, { "epoch": 0.089875, "grad_norm": 3.8125, "grad_norm_var": 0.07261962890625, "learning_rate": 0.0001, "loss": 8.9615, "loss/crossentropy": 2.464845299720764, "loss/hidden": 3.484375, "loss/jsd": 0.0, "loss/logits": 0.29659199714660645, "step": 1438 }, { "epoch": 0.09, "grad_norm": 3.421875, "grad_norm_var": 0.034989420572916666, "learning_rate": 0.0001, "loss": 8.7208, "loss/crossentropy": 2.349491000175476, "loss/hidden": 3.5234375, "loss/jsd": 0.0, "loss/logits": 0.31345631182193756, "step": 1440 }, { "epoch": 0.090125, "grad_norm": 3.578125, "grad_norm_var": 0.063623046875, "learning_rate": 0.0001, "loss": 9.1662, "loss/crossentropy": 2.5956228971481323, "loss/hidden": 3.4921875, "loss/jsd": 0.0, "loss/logits": 0.30429892241954803, "step": 1442 }, { "epoch": 0.09025, "grad_norm": 3.71875, "grad_norm_var": 0.06392822265625, "learning_rate": 0.0001, "loss": 8.7371, "loss/crossentropy": 2.3084046840667725, "loss/hidden": 3.359375, "loss/jsd": 0.0, "loss/logits": 0.2794123440980911, "step": 1444 }, { "epoch": 0.090375, "grad_norm": 4.21875, "grad_norm_var": 0.08909098307291667, "learning_rate": 0.0001, "loss": 8.6431, "loss/crossentropy": 2.0833881497383118, "loss/hidden": 3.375, "loss/jsd": 0.0, "loss/logits": 0.2675230801105499, "step": 1446 }, { "epoch": 0.0905, "grad_norm": 3.640625, "grad_norm_var": 0.08801676432291666, "learning_rate": 0.0001, "loss": 9.0263, "loss/crossentropy": 2.3375617265701294, "loss/hidden": 3.546875, "loss/jsd": 0.0, "loss/logits": 0.3266754746437073, "step": 1448 }, { "epoch": 0.090625, "grad_norm": 4.9375, "grad_norm_var": 0.2819295247395833, "learning_rate": 0.0001, "loss": 9.0776, "loss/crossentropy": 2.259764075279236, "loss/hidden": 3.625, "loss/jsd": 0.0, "loss/logits": 0.29133765399456024, "step": 1450 }, { "epoch": 0.09075, "grad_norm": 3.0625, "grad_norm_var": 0.3140777587890625, "learning_rate": 0.0001, "loss": 8.755, "loss/crossentropy": 2.4632983207702637, "loss/hidden": 3.5078125, "loss/jsd": 0.0, "loss/logits": 0.30049507319927216, "step": 1452 }, { "epoch": 0.090875, "grad_norm": 3.640625, "grad_norm_var": 0.3232086181640625, "learning_rate": 0.0001, "loss": 8.8504, "loss/crossentropy": 2.3947253227233887, "loss/hidden": 3.46875, "loss/jsd": 0.0, "loss/logits": 0.2989191859960556, "step": 1454 }, { "epoch": 0.091, "grad_norm": 3.59375, "grad_norm_var": 0.3135579427083333, "learning_rate": 0.0001, "loss": 8.7741, "loss/crossentropy": 2.366453766822815, "loss/hidden": 3.390625, "loss/jsd": 0.0, "loss/logits": 0.2914280444383621, "step": 1456 }, { "epoch": 0.091125, "grad_norm": 3.28125, "grad_norm_var": 0.32301432291666665, "learning_rate": 0.0001, "loss": 8.6575, "loss/crossentropy": 2.1803172826766968, "loss/hidden": 3.453125, "loss/jsd": 0.0, "loss/logits": 0.3163032829761505, "step": 1458 }, { "epoch": 0.09125, "grad_norm": 3.796875, "grad_norm_var": 0.32535400390625, "learning_rate": 0.0001, "loss": 9.0737, "loss/crossentropy": 2.459627389907837, "loss/hidden": 3.609375, "loss/jsd": 0.0, "loss/logits": 0.31055358052253723, "step": 1460 }, { "epoch": 0.091375, "grad_norm": 3.5, "grad_norm_var": 0.31604410807291666, "learning_rate": 0.0001, "loss": 8.8823, "loss/crossentropy": 2.204772710800171, "loss/hidden": 3.4375, "loss/jsd": 0.0, "loss/logits": 0.27741169929504395, "step": 1462 }, { "epoch": 0.0915, "grad_norm": 3.453125, "grad_norm_var": 0.33539937337239584, "learning_rate": 0.0001, "loss": 8.8218, "loss/crossentropy": 2.3198187351226807, "loss/hidden": 3.4375, "loss/jsd": 0.0, "loss/logits": 0.3104029595851898, "step": 1464 }, { "epoch": 0.091625, "grad_norm": 3.515625, "grad_norm_var": 0.08391520182291666, "learning_rate": 0.0001, "loss": 8.7448, "loss/crossentropy": 2.183798313140869, "loss/hidden": 3.6171875, "loss/jsd": 0.0, "loss/logits": 0.31394851207733154, "step": 1466 }, { "epoch": 0.09175, "grad_norm": 3.6875, "grad_norm_var": 0.07111714680989584, "learning_rate": 0.0001, "loss": 9.1398, "loss/crossentropy": 2.5468273162841797, "loss/hidden": 3.65625, "loss/jsd": 0.0, "loss/logits": 0.3174278736114502, "step": 1468 }, { "epoch": 0.091875, "grad_norm": 3.421875, "grad_norm_var": 0.07214253743489583, "learning_rate": 0.0001, "loss": 8.887, "loss/crossentropy": 2.617543339729309, "loss/hidden": 3.4296875, "loss/jsd": 0.0, "loss/logits": 0.28839461505413055, "step": 1470 }, { "epoch": 0.092, "grad_norm": 3.640625, "grad_norm_var": 0.05963134765625, "learning_rate": 0.0001, "loss": 8.6417, "loss/crossentropy": 2.598427653312683, "loss/hidden": 3.5, "loss/jsd": 0.0, "loss/logits": 0.32050472497940063, "step": 1472 }, { "epoch": 0.092125, "grad_norm": 3.625, "grad_norm_var": 0.047883097330729166, "learning_rate": 0.0001, "loss": 9.0224, "loss/crossentropy": 2.545255661010742, "loss/hidden": 3.4921875, "loss/jsd": 0.0, "loss/logits": 0.30465172231197357, "step": 1474 }, { "epoch": 0.09225, "grad_norm": 3.640625, "grad_norm_var": 0.049267578125, "learning_rate": 0.0001, "loss": 8.8417, "loss/crossentropy": 2.4060131311416626, "loss/hidden": 3.375, "loss/jsd": 0.0, "loss/logits": 0.286019504070282, "step": 1476 }, { "epoch": 0.092375, "grad_norm": 3.6875, "grad_norm_var": 0.022591145833333333, "learning_rate": 0.0001, "loss": 8.7607, "loss/crossentropy": 2.196686089038849, "loss/hidden": 3.375, "loss/jsd": 0.0, "loss/logits": 0.2895347326993942, "step": 1478 }, { "epoch": 0.0925, "grad_norm": 3.484375, "grad_norm_var": 0.03193257649739583, "learning_rate": 0.0001, "loss": 8.9549, "loss/crossentropy": 2.2161173820495605, "loss/hidden": 3.5546875, "loss/jsd": 0.0, "loss/logits": 0.3376694321632385, "step": 1480 }, { "epoch": 0.092625, "grad_norm": 3.8125, "grad_norm_var": 0.031916300455729164, "learning_rate": 0.0001, "loss": 8.9692, "loss/crossentropy": 2.4159319400787354, "loss/hidden": 3.5, "loss/jsd": 0.0, "loss/logits": 0.33057647943496704, "step": 1482 }, { "epoch": 0.09275, "grad_norm": 3.328125, "grad_norm_var": 0.036844889322916664, "learning_rate": 0.0001, "loss": 8.7113, "loss/crossentropy": 2.301741361618042, "loss/hidden": 3.5234375, "loss/jsd": 0.0, "loss/logits": 0.31546755135059357, "step": 1484 }, { "epoch": 0.092875, "grad_norm": 3.515625, "grad_norm_var": 0.0347076416015625, "learning_rate": 0.0001, "loss": 8.9267, "loss/crossentropy": 2.2651939392089844, "loss/hidden": 3.4296875, "loss/jsd": 0.0, "loss/logits": 0.30455446243286133, "step": 1486 }, { "epoch": 0.093, "grad_norm": 3.75, "grad_norm_var": 0.0397613525390625, "learning_rate": 0.0001, "loss": 9.0545, "loss/crossentropy": 2.5079206228256226, "loss/hidden": 3.46875, "loss/jsd": 0.0, "loss/logits": 0.3329392969608307, "step": 1488 }, { "epoch": 0.093125, "grad_norm": 3.625, "grad_norm_var": 0.04644775390625, "learning_rate": 0.0001, "loss": 9.1031, "loss/crossentropy": 2.3883864879608154, "loss/hidden": 3.59375, "loss/jsd": 0.0, "loss/logits": 0.31360116600990295, "step": 1490 }, { "epoch": 0.09325, "grad_norm": 3.96875, "grad_norm_var": 0.04755859375, "learning_rate": 0.0001, "loss": 9.1668, "loss/crossentropy": 2.251497983932495, "loss/hidden": 3.4765625, "loss/jsd": 0.0, "loss/logits": 0.32022619247436523, "step": 1492 }, { "epoch": 0.093375, "grad_norm": 3.609375, "grad_norm_var": 0.0482818603515625, "learning_rate": 0.0001, "loss": 8.7119, "loss/crossentropy": 2.223703145980835, "loss/hidden": 3.375, "loss/jsd": 0.0, "loss/logits": 0.2985079288482666, "step": 1494 }, { "epoch": 0.0935, "grad_norm": 3.390625, "grad_norm_var": 0.04791259765625, "learning_rate": 0.0001, "loss": 8.7767, "loss/crossentropy": 2.3311924934387207, "loss/hidden": 3.4453125, "loss/jsd": 0.0, "loss/logits": 0.29468269646167755, "step": 1496 }, { "epoch": 0.093625, "grad_norm": 3.65625, "grad_norm_var": 0.05221354166666667, "learning_rate": 0.0001, "loss": 8.7455, "loss/crossentropy": 2.2993088960647583, "loss/hidden": 3.3671875, "loss/jsd": 0.0, "loss/logits": 0.31247611343860626, "step": 1498 }, { "epoch": 0.09375, "grad_norm": 3.34375, "grad_norm_var": 0.053376261393229166, "learning_rate": 0.0001, "loss": 8.7313, "loss/crossentropy": 2.2700235843658447, "loss/hidden": 3.5078125, "loss/jsd": 0.0, "loss/logits": 0.2984912842512131, "step": 1500 }, { "epoch": 0.093875, "grad_norm": 4.84375, "grad_norm_var": 0.15165608723958332, "learning_rate": 0.0001, "loss": 8.8359, "loss/crossentropy": 2.365916609764099, "loss/hidden": 3.4375, "loss/jsd": 0.0, "loss/logits": 0.28694379329681396, "step": 1502 }, { "epoch": 0.094, "grad_norm": 4.09375, "grad_norm_var": 0.15507710774739583, "learning_rate": 0.0001, "loss": 8.9381, "loss/crossentropy": 2.357543706893921, "loss/hidden": 3.5703125, "loss/jsd": 0.0, "loss/logits": 0.3162877708673477, "step": 1504 }, { "epoch": 0.094125, "grad_norm": 3.421875, "grad_norm_var": 0.16050516764322917, "learning_rate": 0.0001, "loss": 8.6509, "loss/crossentropy": 2.235751748085022, "loss/hidden": 3.3671875, "loss/jsd": 0.0, "loss/logits": 0.31035009026527405, "step": 1506 }, { "epoch": 0.09425, "grad_norm": 3.84375, "grad_norm_var": 0.15790913899739584, "learning_rate": 0.0001, "loss": 8.9785, "loss/crossentropy": 2.421581745147705, "loss/hidden": 3.5, "loss/jsd": 0.0, "loss/logits": 0.30988195538520813, "step": 1508 }, { "epoch": 0.094375, "grad_norm": 3.625, "grad_norm_var": 0.1599761962890625, "learning_rate": 0.0001, "loss": 8.9374, "loss/crossentropy": 2.314875602722168, "loss/hidden": 3.4140625, "loss/jsd": 0.0, "loss/logits": 0.2935473322868347, "step": 1510 }, { "epoch": 0.0945, "grad_norm": 4.0, "grad_norm_var": 0.14195556640625, "learning_rate": 0.0001, "loss": 9.2329, "loss/crossentropy": 2.3859556913375854, "loss/hidden": 3.5625, "loss/jsd": 0.0, "loss/logits": 0.34312979876995087, "step": 1512 }, { "epoch": 0.094625, "grad_norm": 3.640625, "grad_norm_var": 0.135791015625, "learning_rate": 0.0001, "loss": 8.8305, "loss/crossentropy": 2.4759390354156494, "loss/hidden": 3.40625, "loss/jsd": 0.0, "loss/logits": 0.30718210339546204, "step": 1514 }, { "epoch": 0.09475, "grad_norm": 3.4375, "grad_norm_var": 0.13124593098958334, "learning_rate": 0.0001, "loss": 8.6304, "loss/crossentropy": 2.4228183031082153, "loss/hidden": 3.4609375, "loss/jsd": 0.0, "loss/logits": 0.2753960192203522, "step": 1516 }, { "epoch": 0.094875, "grad_norm": 3.4375, "grad_norm_var": 0.04909566243489583, "learning_rate": 0.0001, "loss": 8.9843, "loss/crossentropy": 2.4210203886032104, "loss/hidden": 3.359375, "loss/jsd": 0.0, "loss/logits": 0.30042560398578644, "step": 1518 }, { "epoch": 0.095, "grad_norm": 4.03125, "grad_norm_var": 0.04656575520833333, "learning_rate": 0.0001, "loss": 9.012, "loss/crossentropy": 2.45758593082428, "loss/hidden": 3.4140625, "loss/jsd": 0.0, "loss/logits": 0.29631373286247253, "step": 1520 }, { "epoch": 0.095125, "grad_norm": 3.265625, "grad_norm_var": 0.049576822916666666, "learning_rate": 0.0001, "loss": 8.5905, "loss/crossentropy": 2.4333043098449707, "loss/hidden": 3.4375, "loss/jsd": 0.0, "loss/logits": 0.2933865636587143, "step": 1522 }, { "epoch": 0.09525, "grad_norm": 3.71875, "grad_norm_var": 0.04767252604166667, "learning_rate": 0.0001, "loss": 9.0024, "loss/crossentropy": 2.32417368888855, "loss/hidden": 3.5234375, "loss/jsd": 0.0, "loss/logits": 0.3141182065010071, "step": 1524 }, { "epoch": 0.095375, "grad_norm": 3.625, "grad_norm_var": 0.0506988525390625, "learning_rate": 0.0001, "loss": 9.1668, "loss/crossentropy": 2.473471999168396, "loss/hidden": 3.5, "loss/jsd": 0.0, "loss/logits": 0.3085293173789978, "step": 1526 }, { "epoch": 0.0955, "grad_norm": 3.625, "grad_norm_var": 0.0439605712890625, "learning_rate": 0.0001, "loss": 8.8465, "loss/crossentropy": 2.4285165071487427, "loss/hidden": 3.4375, "loss/jsd": 0.0, "loss/logits": 0.2852857708930969, "step": 1528 }, { "epoch": 0.095625, "grad_norm": 3.421875, "grad_norm_var": 0.0430328369140625, "learning_rate": 0.0001, "loss": 8.6667, "loss/crossentropy": 2.263743579387665, "loss/hidden": 3.328125, "loss/jsd": 0.0, "loss/logits": 0.2716449797153473, "step": 1530 }, { "epoch": 0.09575, "grad_norm": 3.421875, "grad_norm_var": 0.045979817708333336, "learning_rate": 0.0001, "loss": 8.8735, "loss/crossentropy": 2.331640362739563, "loss/hidden": 3.3203125, "loss/jsd": 0.0, "loss/logits": 0.28873661160469055, "step": 1532 }, { "epoch": 0.095875, "grad_norm": 3.40625, "grad_norm_var": 0.04674072265625, "learning_rate": 0.0001, "loss": 8.8375, "loss/crossentropy": 2.4132364988327026, "loss/hidden": 3.3671875, "loss/jsd": 0.0, "loss/logits": 0.285625159740448, "step": 1534 }, { "epoch": 0.096, "grad_norm": 3.375, "grad_norm_var": 0.039728800455729164, "learning_rate": 0.0001, "loss": 8.7776, "loss/crossentropy": 2.4001948833465576, "loss/hidden": 3.390625, "loss/jsd": 0.0, "loss/logits": 0.31654833257198334, "step": 1536 }, { "epoch": 0.096125, "grad_norm": 3.796875, "grad_norm_var": 0.03581441243489583, "learning_rate": 0.0001, "loss": 8.8068, "loss/crossentropy": 2.258412718772888, "loss/hidden": 3.390625, "loss/jsd": 0.0, "loss/logits": 0.2980117201805115, "step": 1538 }, { "epoch": 0.09625, "grad_norm": 3.5625, "grad_norm_var": 0.032958984375, "learning_rate": 0.0001, "loss": 8.3933, "loss/crossentropy": 2.3027628660202026, "loss/hidden": 3.40625, "loss/jsd": 0.0, "loss/logits": 0.28253039717674255, "step": 1540 }, { "epoch": 0.096375, "grad_norm": 3.453125, "grad_norm_var": 0.024409993489583334, "learning_rate": 0.0001, "loss": 8.6451, "loss/crossentropy": 2.3603252172470093, "loss/hidden": 3.3984375, "loss/jsd": 0.0, "loss/logits": 0.2751441150903702, "step": 1542 }, { "epoch": 0.0965, "grad_norm": 3.5, "grad_norm_var": 0.029710896809895835, "learning_rate": 0.0001, "loss": 8.6759, "loss/crossentropy": 2.5256091356277466, "loss/hidden": 3.3359375, "loss/jsd": 0.0, "loss/logits": 0.26941487193107605, "step": 1544 }, { "epoch": 0.096625, "grad_norm": 4.0, "grad_norm_var": 0.04315999348958333, "learning_rate": 0.0001, "loss": 9.1672, "loss/crossentropy": 2.4796286821365356, "loss/hidden": 3.46875, "loss/jsd": 0.0, "loss/logits": 0.3184451460838318, "step": 1546 }, { "epoch": 0.09675, "grad_norm": 3.5625, "grad_norm_var": 0.04156901041666667, "learning_rate": 0.0001, "loss": 8.8819, "loss/crossentropy": 2.4188071489334106, "loss/hidden": 3.40625, "loss/jsd": 0.0, "loss/logits": 0.31058116257190704, "step": 1548 }, { "epoch": 0.096875, "grad_norm": 3.3125, "grad_norm_var": 0.045441691080729166, "learning_rate": 0.0001, "loss": 8.7315, "loss/crossentropy": 2.318482995033264, "loss/hidden": 3.421875, "loss/jsd": 0.0, "loss/logits": 0.2847931683063507, "step": 1550 }, { "epoch": 0.097, "grad_norm": 3.578125, "grad_norm_var": 0.05179036458333333, "learning_rate": 0.0001, "loss": 8.8762, "loss/crossentropy": 2.24004590511322, "loss/hidden": 3.40625, "loss/jsd": 0.0, "loss/logits": 0.28266097605228424, "step": 1552 }, { "epoch": 0.097125, "grad_norm": 3.171875, "grad_norm_var": 0.05720926920572917, "learning_rate": 0.0001, "loss": 8.6779, "loss/crossentropy": 2.10389643907547, "loss/hidden": 3.34375, "loss/jsd": 0.0, "loss/logits": 0.2705800235271454, "step": 1554 }, { "epoch": 0.09725, "grad_norm": 3.671875, "grad_norm_var": 0.06609598795572917, "learning_rate": 0.0001, "loss": 8.7698, "loss/crossentropy": 2.2352887392044067, "loss/hidden": 3.4375, "loss/jsd": 0.0, "loss/logits": 0.30223119258880615, "step": 1556 }, { "epoch": 0.097375, "grad_norm": 3.953125, "grad_norm_var": 0.09449869791666667, "learning_rate": 0.0001, "loss": 9.038, "loss/crossentropy": 2.4962942600250244, "loss/hidden": 3.4609375, "loss/jsd": 0.0, "loss/logits": 0.3383851647377014, "step": 1558 }, { "epoch": 0.0975, "grad_norm": 3.484375, "grad_norm_var": 0.07860921223958334, "learning_rate": 0.0001, "loss": 8.6868, "loss/crossentropy": 2.388336658477783, "loss/hidden": 3.3671875, "loss/jsd": 0.0, "loss/logits": 0.2802818864583969, "step": 1560 }, { "epoch": 0.097625, "grad_norm": 3.40625, "grad_norm_var": 0.06916910807291667, "learning_rate": 0.0001, "loss": 8.8473, "loss/crossentropy": 2.4475581645965576, "loss/hidden": 3.4375, "loss/jsd": 0.0, "loss/logits": 0.2849755436182022, "step": 1562 }, { "epoch": 0.09775, "grad_norm": 3.203125, "grad_norm_var": 0.0787506103515625, "learning_rate": 0.0001, "loss": 8.5789, "loss/crossentropy": 2.5317403078079224, "loss/hidden": 3.46875, "loss/jsd": 0.0, "loss/logits": 0.29401715099811554, "step": 1564 }, { "epoch": 0.097875, "grad_norm": 3.671875, "grad_norm_var": 0.08131103515625, "learning_rate": 0.0001, "loss": 8.9867, "loss/crossentropy": 2.547517776489258, "loss/hidden": 3.5390625, "loss/jsd": 0.0, "loss/logits": 0.3130339980125427, "step": 1566 }, { "epoch": 0.098, "grad_norm": 3.625, "grad_norm_var": 0.07197265625, "learning_rate": 0.0001, "loss": 8.6146, "loss/crossentropy": 2.1298526525497437, "loss/hidden": 3.359375, "loss/jsd": 0.0, "loss/logits": 0.2899221330881119, "step": 1568 }, { "epoch": 0.098125, "grad_norm": 3.4375, "grad_norm_var": 0.06285400390625, "learning_rate": 0.0001, "loss": 8.9054, "loss/crossentropy": 2.4564274549484253, "loss/hidden": 3.375, "loss/jsd": 0.0, "loss/logits": 0.313205823302269, "step": 1570 }, { "epoch": 0.09825, "grad_norm": 3.546875, "grad_norm_var": 0.05227864583333333, "learning_rate": 0.0001, "loss": 9.0662, "loss/crossentropy": 2.246076822280884, "loss/hidden": 3.4140625, "loss/jsd": 0.0, "loss/logits": 0.2935841381549835, "step": 1572 }, { "epoch": 0.098375, "grad_norm": 3.3125, "grad_norm_var": 0.027961222330729167, "learning_rate": 0.0001, "loss": 8.9055, "loss/crossentropy": 2.38775098323822, "loss/hidden": 3.5078125, "loss/jsd": 0.0, "loss/logits": 0.32380862534046173, "step": 1574 }, { "epoch": 0.0985, "grad_norm": 3.859375, "grad_norm_var": 0.035700480143229164, "learning_rate": 0.0001, "loss": 8.7802, "loss/crossentropy": 2.3213651180267334, "loss/hidden": 3.3515625, "loss/jsd": 0.0, "loss/logits": 0.2993357926607132, "step": 1576 }, { "epoch": 0.098625, "grad_norm": 3.21875, "grad_norm_var": 0.04185282389322917, "learning_rate": 0.0001, "loss": 8.5289, "loss/crossentropy": 2.0965049266815186, "loss/hidden": 3.3359375, "loss/jsd": 0.0, "loss/logits": 0.279249906539917, "step": 1578 }, { "epoch": 0.09875, "grad_norm": 3.484375, "grad_norm_var": 0.03489481608072917, "learning_rate": 0.0001, "loss": 8.6456, "loss/crossentropy": 2.1175107955932617, "loss/hidden": 3.328125, "loss/jsd": 0.0, "loss/logits": 0.273018017411232, "step": 1580 }, { "epoch": 0.098875, "grad_norm": 3.328125, "grad_norm_var": 0.032796223958333336, "learning_rate": 0.0001, "loss": 8.9137, "loss/crossentropy": 2.547134518623352, "loss/hidden": 3.46875, "loss/jsd": 0.0, "loss/logits": 0.3091147541999817, "step": 1582 }, { "epoch": 0.099, "grad_norm": 3.421875, "grad_norm_var": 0.033589680989583336, "learning_rate": 0.0001, "loss": 8.678, "loss/crossentropy": 2.406341552734375, "loss/hidden": 3.3515625, "loss/jsd": 0.0, "loss/logits": 0.2856515645980835, "step": 1584 }, { "epoch": 0.099125, "grad_norm": 3.859375, "grad_norm_var": 0.06271870930989583, "learning_rate": 0.0001, "loss": 8.8898, "loss/crossentropy": 2.435731291770935, "loss/hidden": 3.4453125, "loss/jsd": 0.0, "loss/logits": 0.29519233107566833, "step": 1586 }, { "epoch": 0.09925, "grad_norm": 3.9375, "grad_norm_var": 0.0727447509765625, "learning_rate": 0.0001, "loss": 8.6688, "loss/crossentropy": 2.3164761066436768, "loss/hidden": 3.390625, "loss/jsd": 0.0, "loss/logits": 0.3253418505191803, "step": 1588 }, { "epoch": 0.099375, "grad_norm": 3.390625, "grad_norm_var": 0.070654296875, "learning_rate": 0.0001, "loss": 8.6513, "loss/crossentropy": 2.2719457149505615, "loss/hidden": 3.46875, "loss/jsd": 0.0, "loss/logits": 0.30052798986434937, "step": 1590 }, { "epoch": 0.0995, "grad_norm": 3.90625, "grad_norm_var": 0.07297770182291667, "learning_rate": 0.0001, "loss": 9.1721, "loss/crossentropy": 2.5084248781204224, "loss/hidden": 3.5625, "loss/jsd": 0.0, "loss/logits": 0.33814041316509247, "step": 1592 }, { "epoch": 0.099625, "grad_norm": 3.40625, "grad_norm_var": 0.06483968098958333, "learning_rate": 0.0001, "loss": 8.9066, "loss/crossentropy": 2.382421851158142, "loss/hidden": 3.3359375, "loss/jsd": 0.0, "loss/logits": 0.3085876405239105, "step": 1594 }, { "epoch": 0.09975, "grad_norm": 3.359375, "grad_norm_var": 0.07033589680989584, "learning_rate": 0.0001, "loss": 8.7353, "loss/crossentropy": 2.3208523988723755, "loss/hidden": 3.359375, "loss/jsd": 0.0, "loss/logits": 0.3062067925930023, "step": 1596 }, { "epoch": 0.099875, "grad_norm": 3.3125, "grad_norm_var": 0.06741434733072917, "learning_rate": 0.0001, "loss": 8.7032, "loss/crossentropy": 2.4263393878936768, "loss/hidden": 3.515625, "loss/jsd": 0.0, "loss/logits": 0.31763342022895813, "step": 1598 }, { "epoch": 0.1, "grad_norm": 3.21875, "grad_norm_var": 0.07450764973958333, "learning_rate": 0.0001, "loss": 8.6868, "loss/crossentropy": 2.247215509414673, "loss/hidden": 3.375, "loss/jsd": 0.0, "loss/logits": 0.28044338524341583, "step": 1600 }, { "epoch": 0.100125, "grad_norm": 3.640625, "grad_norm_var": 0.04682515462239583, "learning_rate": 0.0001, "loss": 8.7077, "loss/crossentropy": 2.3089388608932495, "loss/hidden": 3.34375, "loss/jsd": 0.0, "loss/logits": 0.28411509096622467, "step": 1602 }, { "epoch": 0.10025, "grad_norm": 3.609375, "grad_norm_var": 0.046296183268229166, "learning_rate": 0.0001, "loss": 9.043, "loss/crossentropy": 2.711973190307617, "loss/hidden": 3.4375, "loss/jsd": 0.0, "loss/logits": 0.310742050409317, "step": 1604 }, { "epoch": 0.100375, "grad_norm": 3.484375, "grad_norm_var": 0.05012919108072917, "learning_rate": 0.0001, "loss": 8.7587, "loss/crossentropy": 2.206353545188904, "loss/hidden": 3.3359375, "loss/jsd": 0.0, "loss/logits": 0.30005398392677307, "step": 1606 }, { "epoch": 0.1005, "grad_norm": 3.671875, "grad_norm_var": 0.042740885416666666, "learning_rate": 0.0001, "loss": 8.7516, "loss/crossentropy": 2.3755295276641846, "loss/hidden": 3.359375, "loss/jsd": 0.0, "loss/logits": 0.28789104521274567, "step": 1608 }, { "epoch": 0.100625, "grad_norm": 3.34375, "grad_norm_var": 0.04962565104166667, "learning_rate": 0.0001, "loss": 8.9675, "loss/crossentropy": 2.5117024183273315, "loss/hidden": 3.40625, "loss/jsd": 0.0, "loss/logits": 0.2872663736343384, "step": 1610 }, { "epoch": 0.10075, "grad_norm": 3.859375, "grad_norm_var": 0.05524088541666667, "learning_rate": 0.0001, "loss": 8.8949, "loss/crossentropy": 2.1095504760742188, "loss/hidden": 3.3671875, "loss/jsd": 0.0, "loss/logits": 0.26681819558143616, "step": 1612 }, { "epoch": 0.100875, "grad_norm": 3.71875, "grad_norm_var": 0.05439351399739583, "learning_rate": 0.0001, "loss": 9.0907, "loss/crossentropy": 2.536198377609253, "loss/hidden": 3.3828125, "loss/jsd": 0.0, "loss/logits": 0.3001484125852585, "step": 1614 }, { "epoch": 0.101, "grad_norm": 3.4375, "grad_norm_var": 0.044331868489583336, "learning_rate": 0.0001, "loss": 8.6285, "loss/crossentropy": 2.444958448410034, "loss/hidden": 3.390625, "loss/jsd": 0.0, "loss/logits": 0.2898731231689453, "step": 1616 }, { "epoch": 0.101125, "grad_norm": 3.5, "grad_norm_var": 0.04670308430989583, "learning_rate": 0.0001, "loss": 8.5124, "loss/crossentropy": 2.492483615875244, "loss/hidden": 3.375, "loss/jsd": 0.0, "loss/logits": 0.29097048938274384, "step": 1618 }, { "epoch": 0.10125, "grad_norm": 3.328125, "grad_norm_var": 0.039159138997395836, "learning_rate": 0.0001, "loss": 8.5946, "loss/crossentropy": 2.3776297569274902, "loss/hidden": 3.3984375, "loss/jsd": 0.0, "loss/logits": 0.2919171154499054, "step": 1620 }, { "epoch": 0.101375, "grad_norm": 3.625, "grad_norm_var": 0.039704386393229166, "learning_rate": 0.0001, "loss": 8.8745, "loss/crossentropy": 2.458945870399475, "loss/hidden": 3.4609375, "loss/jsd": 0.0, "loss/logits": 0.31617042422294617, "step": 1622 }, { "epoch": 0.1015, "grad_norm": 4.5, "grad_norm_var": 0.09348551432291667, "learning_rate": 0.0001, "loss": 8.7595, "loss/crossentropy": 2.2380698919296265, "loss/hidden": 3.4375, "loss/jsd": 0.0, "loss/logits": 0.32717761397361755, "step": 1624 }, { "epoch": 0.101625, "grad_norm": 3.40625, "grad_norm_var": 0.0921051025390625, "learning_rate": 0.0001, "loss": 8.8646, "loss/crossentropy": 2.504348874092102, "loss/hidden": 3.3828125, "loss/jsd": 0.0, "loss/logits": 0.29782192409038544, "step": 1626 }, { "epoch": 0.10175, "grad_norm": 3.4375, "grad_norm_var": 0.09677734375, "learning_rate": 0.0001, "loss": 8.5019, "loss/crossentropy": 2.281771183013916, "loss/hidden": 3.453125, "loss/jsd": 0.0, "loss/logits": 0.2809130549430847, "step": 1628 }, { "epoch": 0.101875, "grad_norm": 3.671875, "grad_norm_var": 0.09638671875, "learning_rate": 0.0001, "loss": 8.7456, "loss/crossentropy": 2.131038784980774, "loss/hidden": 3.3671875, "loss/jsd": 0.0, "loss/logits": 0.2949763238430023, "step": 1630 }, { "epoch": 0.102, "grad_norm": 4.0625, "grad_norm_var": 0.11483968098958333, "learning_rate": 0.0001, "loss": 8.5063, "loss/crossentropy": 2.3742181062698364, "loss/hidden": 3.46875, "loss/jsd": 0.0, "loss/logits": 0.30859068036079407, "step": 1632 }, { "epoch": 0.102125, "grad_norm": 3.609375, "grad_norm_var": 0.1124176025390625, "learning_rate": 0.0001, "loss": 8.5566, "loss/crossentropy": 2.250689148902893, "loss/hidden": 3.359375, "loss/jsd": 0.0, "loss/logits": 0.2848288118839264, "step": 1634 }, { "epoch": 0.10225, "grad_norm": 3.46875, "grad_norm_var": 0.10181884765625, "learning_rate": 0.0001, "loss": 8.7109, "loss/crossentropy": 2.4584755897521973, "loss/hidden": 3.4453125, "loss/jsd": 0.0, "loss/logits": 0.29475922882556915, "step": 1636 }, { "epoch": 0.102375, "grad_norm": 3.65625, "grad_norm_var": 0.10305074055989584, "learning_rate": 0.0001, "loss": 8.5859, "loss/crossentropy": 2.243297576904297, "loss/hidden": 3.3046875, "loss/jsd": 0.0, "loss/logits": 0.2687932550907135, "step": 1638 }, { "epoch": 0.1025, "grad_norm": 3.28125, "grad_norm_var": 0.05028889973958333, "learning_rate": 0.0001, "loss": 8.807, "loss/crossentropy": 2.3591182231903076, "loss/hidden": 3.3828125, "loss/jsd": 0.0, "loss/logits": 0.2878338694572449, "step": 1640 }, { "epoch": 0.102625, "grad_norm": 3.40625, "grad_norm_var": 0.0458892822265625, "learning_rate": 0.0001, "loss": 8.7064, "loss/crossentropy": 2.53536856174469, "loss/hidden": 3.3515625, "loss/jsd": 0.0, "loss/logits": 0.28173747658729553, "step": 1642 }, { "epoch": 0.10275, "grad_norm": 3.65625, "grad_norm_var": 0.043879191080729164, "learning_rate": 0.0001, "loss": 8.5829, "loss/crossentropy": 2.5345053672790527, "loss/hidden": 3.5390625, "loss/jsd": 0.0, "loss/logits": 0.30797943472862244, "step": 1644 }, { "epoch": 0.102875, "grad_norm": 3.6875, "grad_norm_var": 0.04512430826822917, "learning_rate": 0.0001, "loss": 8.5724, "loss/crossentropy": 2.3095182180404663, "loss/hidden": 3.359375, "loss/jsd": 0.0, "loss/logits": 0.2998042106628418, "step": 1646 }, { "epoch": 0.103, "grad_norm": 3.484375, "grad_norm_var": 0.0231842041015625, "learning_rate": 0.0001, "loss": 8.6461, "loss/crossentropy": 2.2530897855758667, "loss/hidden": 3.328125, "loss/jsd": 0.0, "loss/logits": 0.30125299096107483, "step": 1648 }, { "epoch": 0.103125, "grad_norm": 3.53125, "grad_norm_var": 0.021875, "learning_rate": 0.0001, "loss": 8.878, "loss/crossentropy": 2.557194471359253, "loss/hidden": 3.34375, "loss/jsd": 0.0, "loss/logits": 0.28575199842453003, "step": 1650 }, { "epoch": 0.10325, "grad_norm": 3.359375, "grad_norm_var": 0.0226226806640625, "learning_rate": 0.0001, "loss": 8.5783, "loss/crossentropy": 2.4618316888809204, "loss/hidden": 3.4296875, "loss/jsd": 0.0, "loss/logits": 0.305898517370224, "step": 1652 }, { "epoch": 0.103375, "grad_norm": 3.65625, "grad_norm_var": 0.02310791015625, "learning_rate": 0.0001, "loss": 8.5743, "loss/crossentropy": 2.524722933769226, "loss/hidden": 3.3828125, "loss/jsd": 0.0, "loss/logits": 0.29061686992645264, "step": 1654 }, { "epoch": 0.1035, "grad_norm": 3.4375, "grad_norm_var": 0.022142537434895835, "learning_rate": 0.0001, "loss": 8.4769, "loss/crossentropy": 2.602474570274353, "loss/hidden": 3.3984375, "loss/jsd": 0.0, "loss/logits": 0.2977643758058548, "step": 1656 }, { "epoch": 0.103625, "grad_norm": 3.4375, "grad_norm_var": 0.022086588541666667, "learning_rate": 0.0001, "loss": 8.7637, "loss/crossentropy": 2.364400863647461, "loss/hidden": 3.453125, "loss/jsd": 0.0, "loss/logits": 0.31225910782814026, "step": 1658 }, { "epoch": 0.10375, "grad_norm": 3.828125, "grad_norm_var": 0.02642822265625, "learning_rate": 0.0001, "loss": 8.6777, "loss/crossentropy": 2.4924440383911133, "loss/hidden": 3.3828125, "loss/jsd": 0.0, "loss/logits": 0.3066846579313278, "step": 1660 }, { "epoch": 0.103875, "grad_norm": 3.59375, "grad_norm_var": 0.02388916015625, "learning_rate": 0.0001, "loss": 8.4898, "loss/crossentropy": 2.183597683906555, "loss/hidden": 3.5, "loss/jsd": 0.0, "loss/logits": 0.2970152199268341, "step": 1662 }, { "epoch": 0.104, "grad_norm": 3.34375, "grad_norm_var": 0.02330322265625, "learning_rate": 0.0001, "loss": 8.6848, "loss/crossentropy": 2.5557087659835815, "loss/hidden": 3.4765625, "loss/jsd": 0.0, "loss/logits": 0.29651692509651184, "step": 1664 }, { "epoch": 0.104125, "grad_norm": 3.328125, "grad_norm_var": 0.0264312744140625, "learning_rate": 0.0001, "loss": 8.6747, "loss/crossentropy": 2.3465185165405273, "loss/hidden": 3.484375, "loss/jsd": 0.0, "loss/logits": 0.30662697553634644, "step": 1666 }, { "epoch": 0.10425, "grad_norm": 3.3125, "grad_norm_var": 0.0306793212890625, "learning_rate": 0.0001, "loss": 8.6526, "loss/crossentropy": 2.4796379804611206, "loss/hidden": 3.34375, "loss/jsd": 0.0, "loss/logits": 0.3008027672767639, "step": 1668 }, { "epoch": 0.104375, "grad_norm": 3.9375, "grad_norm_var": 0.04387613932291667, "learning_rate": 0.0001, "loss": 8.9315, "loss/crossentropy": 2.4333351850509644, "loss/hidden": 3.5, "loss/jsd": 0.0, "loss/logits": 0.29662713408470154, "step": 1670 }, { "epoch": 0.1045, "grad_norm": 3.3125, "grad_norm_var": 0.08879801432291666, "learning_rate": 0.0001, "loss": 8.6483, "loss/crossentropy": 2.0529088377952576, "loss/hidden": 3.3984375, "loss/jsd": 0.0, "loss/logits": 0.283397376537323, "step": 1672 }, { "epoch": 0.104625, "grad_norm": 3.359375, "grad_norm_var": 0.089306640625, "learning_rate": 0.0001, "loss": 8.8812, "loss/crossentropy": 2.5718494653701782, "loss/hidden": 3.46875, "loss/jsd": 0.0, "loss/logits": 0.32283854484558105, "step": 1674 }, { "epoch": 0.10475, "grad_norm": 3.515625, "grad_norm_var": 0.07821858723958333, "learning_rate": 0.0001, "loss": 8.6264, "loss/crossentropy": 2.301028847694397, "loss/hidden": 3.40625, "loss/jsd": 0.0, "loss/logits": 0.3003203123807907, "step": 1676 }, { "epoch": 0.104875, "grad_norm": 3.171875, "grad_norm_var": 0.08322652180989583, "learning_rate": 0.0001, "loss": 8.5515, "loss/crossentropy": 2.2602317333221436, "loss/hidden": 3.3125, "loss/jsd": 0.0, "loss/logits": 0.26862217485904694, "step": 1678 }, { "epoch": 0.105, "grad_norm": 3.5625, "grad_norm_var": 0.08742574055989584, "learning_rate": 0.0001, "loss": 8.5586, "loss/crossentropy": 2.6255671977996826, "loss/hidden": 3.4140625, "loss/jsd": 0.0, "loss/logits": 0.3011835664510727, "step": 1680 }, { "epoch": 0.105125, "grad_norm": 3.109375, "grad_norm_var": 0.09278055826822916, "learning_rate": 0.0001, "loss": 8.6635, "loss/crossentropy": 2.5335636138916016, "loss/hidden": 3.34375, "loss/jsd": 0.0, "loss/logits": 0.28424978256225586, "step": 1682 }, { "epoch": 0.10525, "grad_norm": 3.421875, "grad_norm_var": 0.08586324055989583, "learning_rate": 0.0001, "loss": 8.6685, "loss/crossentropy": 2.275408983230591, "loss/hidden": 3.3828125, "loss/jsd": 0.0, "loss/logits": 0.2945568561553955, "step": 1684 }, { "epoch": 0.105375, "grad_norm": 3.46875, "grad_norm_var": 0.07172749837239584, "learning_rate": 0.0001, "loss": 8.6884, "loss/crossentropy": 2.459862232208252, "loss/hidden": 3.296875, "loss/jsd": 0.0, "loss/logits": 0.2846823185682297, "step": 1686 }, { "epoch": 0.1055, "grad_norm": 3.984375, "grad_norm_var": 0.046418253580729166, "learning_rate": 0.0001, "loss": 8.5572, "loss/crossentropy": 2.2836594581604004, "loss/hidden": 3.296875, "loss/jsd": 0.0, "loss/logits": 0.2825407385826111, "step": 1688 }, { "epoch": 0.105625, "grad_norm": 3.28125, "grad_norm_var": 0.04759012858072917, "learning_rate": 0.0001, "loss": 8.8016, "loss/crossentropy": 2.0917385816574097, "loss/hidden": 3.328125, "loss/jsd": 0.0, "loss/logits": 0.290659636259079, "step": 1690 }, { "epoch": 0.10575, "grad_norm": 3.265625, "grad_norm_var": 0.05226949055989583, "learning_rate": 0.0001, "loss": 8.6239, "loss/crossentropy": 2.3286694288253784, "loss/hidden": 3.4609375, "loss/jsd": 0.0, "loss/logits": 0.29725848138332367, "step": 1692 }, { "epoch": 0.105875, "grad_norm": 3.4375, "grad_norm_var": 0.047093709309895836, "learning_rate": 0.0001, "loss": 8.6667, "loss/crossentropy": 2.1655561327934265, "loss/hidden": 3.328125, "loss/jsd": 0.0, "loss/logits": 0.2947119176387787, "step": 1694 }, { "epoch": 0.106, "grad_norm": 3.578125, "grad_norm_var": 0.04413655598958333, "learning_rate": 0.0001, "loss": 8.9215, "loss/crossentropy": 2.4700855016708374, "loss/hidden": 3.3828125, "loss/jsd": 0.0, "loss/logits": 0.2996203601360321, "step": 1696 }, { "epoch": 0.106125, "grad_norm": 3.34375, "grad_norm_var": 0.0376129150390625, "learning_rate": 0.0001, "loss": 8.6172, "loss/crossentropy": 2.4698420763015747, "loss/hidden": 3.34375, "loss/jsd": 0.0, "loss/logits": 0.2655749022960663, "step": 1698 }, { "epoch": 0.10625, "grad_norm": 3.546875, "grad_norm_var": 0.03808186848958333, "learning_rate": 0.0001, "loss": 8.5207, "loss/crossentropy": 2.3951499462127686, "loss/hidden": 3.3671875, "loss/jsd": 0.0, "loss/logits": 0.3072836995124817, "step": 1700 }, { "epoch": 0.106375, "grad_norm": 3.515625, "grad_norm_var": 0.038182576497395836, "learning_rate": 0.0001, "loss": 8.6023, "loss/crossentropy": 2.334869146347046, "loss/hidden": 3.40625, "loss/jsd": 0.0, "loss/logits": 0.2989247292280197, "step": 1702 }, { "epoch": 0.1065, "grad_norm": 4.6875, "grad_norm_var": 0.11238606770833333, "learning_rate": 0.0001, "loss": 8.6929, "loss/crossentropy": 2.1444605588912964, "loss/hidden": 3.40625, "loss/jsd": 0.0, "loss/logits": 0.31034591794013977, "step": 1704 }, { "epoch": 0.106625, "grad_norm": 3.359375, "grad_norm_var": 0.11077473958333334, "learning_rate": 0.0001, "loss": 8.6212, "loss/crossentropy": 2.298153877258301, "loss/hidden": 3.3203125, "loss/jsd": 0.0, "loss/logits": 0.27205270528793335, "step": 1706 }, { "epoch": 0.10675, "grad_norm": 3.65625, "grad_norm_var": 0.10321858723958334, "learning_rate": 0.0001, "loss": 8.6963, "loss/crossentropy": 2.4254097938537598, "loss/hidden": 3.34375, "loss/jsd": 0.0, "loss/logits": 0.32095080614089966, "step": 1708 }, { "epoch": 0.106875, "grad_norm": 3.5, "grad_norm_var": 0.10187174479166666, "learning_rate": 0.0001, "loss": 8.4695, "loss/crossentropy": 2.1711431741714478, "loss/hidden": 3.28125, "loss/jsd": 0.0, "loss/logits": 0.2679228186607361, "step": 1710 }, { "epoch": 0.107, "grad_norm": 3.3125, "grad_norm_var": 0.10679423014322917, "learning_rate": 0.0001, "loss": 8.3566, "loss/crossentropy": 2.1170949935913086, "loss/hidden": 3.34375, "loss/jsd": 0.0, "loss/logits": 0.3108642250299454, "step": 1712 }, { "epoch": 0.107125, "grad_norm": 3.109375, "grad_norm_var": 0.11456705729166666, "learning_rate": 0.0001, "loss": 8.644, "loss/crossentropy": 2.5449509620666504, "loss/hidden": 3.375, "loss/jsd": 0.0, "loss/logits": 0.3044998347759247, "step": 1714 }, { "epoch": 0.10725, "grad_norm": 4.0625, "grad_norm_var": 0.13430582682291667, "learning_rate": 0.0001, "loss": 8.8835, "loss/crossentropy": 2.4760228395462036, "loss/hidden": 3.5234375, "loss/jsd": 0.0, "loss/logits": 0.32176442444324493, "step": 1716 }, { "epoch": 0.107375, "grad_norm": 3.53125, "grad_norm_var": 0.13414306640625, "learning_rate": 0.0001, "loss": 8.6377, "loss/crossentropy": 2.3086230754852295, "loss/hidden": 3.40625, "loss/jsd": 0.0, "loss/logits": 0.3137579560279846, "step": 1718 }, { "epoch": 0.1075, "grad_norm": 3.59375, "grad_norm_var": 0.04918212890625, "learning_rate": 0.0001, "loss": 8.9384, "loss/crossentropy": 2.650801420211792, "loss/hidden": 3.5078125, "loss/jsd": 0.0, "loss/logits": 0.31000974774360657, "step": 1720 }, { "epoch": 0.107625, "grad_norm": 3.765625, "grad_norm_var": 0.054248046875, "learning_rate": 0.0001, "loss": 9.2105, "loss/crossentropy": 2.595288038253784, "loss/hidden": 3.4609375, "loss/jsd": 0.0, "loss/logits": 0.3103038817644119, "step": 1722 }, { "epoch": 0.10775, "grad_norm": 3.3125, "grad_norm_var": 0.057038370768229166, "learning_rate": 0.0001, "loss": 8.8991, "loss/crossentropy": 2.5326555967330933, "loss/hidden": 3.328125, "loss/jsd": 0.0, "loss/logits": 0.3022647351026535, "step": 1724 }, { "epoch": 0.107875, "grad_norm": 3.703125, "grad_norm_var": 0.059403483072916666, "learning_rate": 0.0001, "loss": 8.8281, "loss/crossentropy": 2.4913820028305054, "loss/hidden": 3.3828125, "loss/jsd": 0.0, "loss/logits": 0.2860126197338104, "step": 1726 }, { "epoch": 0.108, "grad_norm": 3.21875, "grad_norm_var": 0.06424153645833333, "learning_rate": 0.0001, "loss": 8.7805, "loss/crossentropy": 2.1372629404067993, "loss/hidden": 3.3984375, "loss/jsd": 0.0, "loss/logits": 0.28189751505851746, "step": 1728 }, { "epoch": 0.108125, "grad_norm": 3.625, "grad_norm_var": 0.0627593994140625, "learning_rate": 0.0001, "loss": 8.8053, "loss/crossentropy": 2.432206392288208, "loss/hidden": 3.5546875, "loss/jsd": 0.0, "loss/logits": 0.35032832622528076, "step": 1730 }, { "epoch": 0.10825, "grad_norm": 3.5, "grad_norm_var": 0.0415191650390625, "learning_rate": 0.0001, "loss": 8.6348, "loss/crossentropy": 2.4481922388076782, "loss/hidden": 3.3515625, "loss/jsd": 0.0, "loss/logits": 0.28543633222579956, "step": 1732 }, { "epoch": 0.108375, "grad_norm": 3.375, "grad_norm_var": 0.04326171875, "learning_rate": 0.0001, "loss": 8.691, "loss/crossentropy": 2.38582444190979, "loss/hidden": 3.5078125, "loss/jsd": 0.0, "loss/logits": 0.3336361348628998, "step": 1734 }, { "epoch": 0.1085, "grad_norm": 3.5, "grad_norm_var": 0.05136311848958333, "learning_rate": 0.0001, "loss": 8.5918, "loss/crossentropy": 2.606018543243408, "loss/hidden": 3.3203125, "loss/jsd": 0.0, "loss/logits": 0.30366943776607513, "step": 1736 }, { "epoch": 0.108625, "grad_norm": 3.59375, "grad_norm_var": 0.03434956868489583, "learning_rate": 0.0001, "loss": 8.7135, "loss/crossentropy": 2.3895175457000732, "loss/hidden": 3.359375, "loss/jsd": 0.0, "loss/logits": 0.3109103590250015, "step": 1738 }, { "epoch": 0.10875, "grad_norm": 3.203125, "grad_norm_var": 0.037328084309895836, "learning_rate": 0.0001, "loss": 8.4959, "loss/crossentropy": 2.0861340761184692, "loss/hidden": 3.4765625, "loss/jsd": 0.0, "loss/logits": 0.28380608558654785, "step": 1740 }, { "epoch": 0.108875, "grad_norm": 3.484375, "grad_norm_var": 0.03951416015625, "learning_rate": 0.0001, "loss": 8.7312, "loss/crossentropy": 2.449671506881714, "loss/hidden": 3.390625, "loss/jsd": 0.0, "loss/logits": 0.28429578244686127, "step": 1742 }, { "epoch": 0.109, "grad_norm": 3.328125, "grad_norm_var": 0.037385050455729166, "learning_rate": 0.0001, "loss": 8.4461, "loss/crossentropy": 2.4647138118743896, "loss/hidden": 3.3515625, "loss/jsd": 0.0, "loss/logits": 0.2784615755081177, "step": 1744 }, { "epoch": 0.109125, "grad_norm": 3.3125, "grad_norm_var": 0.033177693684895836, "learning_rate": 0.0001, "loss": 8.7449, "loss/crossentropy": 2.3156272172927856, "loss/hidden": 3.375, "loss/jsd": 0.0, "loss/logits": 0.2726980447769165, "step": 1746 }, { "epoch": 0.10925, "grad_norm": 3.40625, "grad_norm_var": 0.03411458333333333, "learning_rate": 0.0001, "loss": 8.613, "loss/crossentropy": 1.9271941781044006, "loss/hidden": 3.28125, "loss/jsd": 0.0, "loss/logits": 0.2630353271961212, "step": 1748 }, { "epoch": 0.109375, "grad_norm": 3.28125, "grad_norm_var": 0.03327534993489583, "learning_rate": 0.0001, "loss": 8.501, "loss/crossentropy": 2.2124346494674683, "loss/hidden": 3.390625, "loss/jsd": 0.0, "loss/logits": 0.29161450266838074, "step": 1750 }, { "epoch": 0.1095, "grad_norm": 3.40625, "grad_norm_var": 0.0279296875, "learning_rate": 0.0001, "loss": 8.8103, "loss/crossentropy": 2.3019338846206665, "loss/hidden": 3.4296875, "loss/jsd": 0.0, "loss/logits": 0.29614363610744476, "step": 1752 }, { "epoch": 0.109625, "grad_norm": 3.875, "grad_norm_var": 0.0394927978515625, "learning_rate": 0.0001, "loss": 8.6762, "loss/crossentropy": 2.299665927886963, "loss/hidden": 3.34375, "loss/jsd": 0.0, "loss/logits": 0.2929713726043701, "step": 1754 }, { "epoch": 0.10975, "grad_norm": 3.9375, "grad_norm_var": 0.05947163899739583, "learning_rate": 0.0001, "loss": 8.7112, "loss/crossentropy": 2.4006038904190063, "loss/hidden": 3.34375, "loss/jsd": 0.0, "loss/logits": 0.2871282994747162, "step": 1756 }, { "epoch": 0.109875, "grad_norm": 3.5, "grad_norm_var": 0.04921875, "learning_rate": 0.0001, "loss": 8.6092, "loss/crossentropy": 2.2746243476867676, "loss/hidden": 3.4453125, "loss/jsd": 0.0, "loss/logits": 0.3178953379392624, "step": 1758 }, { "epoch": 0.11, "grad_norm": 3.21875, "grad_norm_var": 0.05380859375, "learning_rate": 0.0001, "loss": 8.4347, "loss/crossentropy": 2.152518630027771, "loss/hidden": 3.3984375, "loss/jsd": 0.0, "loss/logits": 0.308669313788414, "step": 1760 }, { "epoch": 0.110125, "grad_norm": 3.15625, "grad_norm_var": 0.05754801432291667, "learning_rate": 0.0001, "loss": 8.515, "loss/crossentropy": 2.313698410987854, "loss/hidden": 3.34375, "loss/jsd": 0.0, "loss/logits": 0.312808558344841, "step": 1762 }, { "epoch": 0.11025, "grad_norm": 3.515625, "grad_norm_var": 0.05791015625, "learning_rate": 0.0001, "loss": 8.4938, "loss/crossentropy": 2.2726303339004517, "loss/hidden": 3.390625, "loss/jsd": 0.0, "loss/logits": 0.2869200110435486, "step": 1764 }, { "epoch": 0.110375, "grad_norm": 3.671875, "grad_norm_var": 0.05747782389322917, "learning_rate": 0.0001, "loss": 8.547, "loss/crossentropy": 2.309714913368225, "loss/hidden": 3.3046875, "loss/jsd": 0.0, "loss/logits": 0.2861059010028839, "step": 1766 }, { "epoch": 0.1105, "grad_norm": 3.5, "grad_norm_var": 0.056493123372395836, "learning_rate": 0.0001, "loss": 8.7686, "loss/crossentropy": 2.429977536201477, "loss/hidden": 3.4609375, "loss/jsd": 0.0, "loss/logits": 0.30839361250400543, "step": 1768 }, { "epoch": 0.110625, "grad_norm": 3.421875, "grad_norm_var": 0.049437459309895834, "learning_rate": 0.0001, "loss": 8.486, "loss/crossentropy": 2.1271519660949707, "loss/hidden": 3.3359375, "loss/jsd": 0.0, "loss/logits": 0.26674604415893555, "step": 1770 }, { "epoch": 0.11075, "grad_norm": 3.390625, "grad_norm_var": 0.02265625, "learning_rate": 0.0001, "loss": 8.6498, "loss/crossentropy": 2.4554080963134766, "loss/hidden": 3.2734375, "loss/jsd": 0.0, "loss/logits": 0.2772922068834305, "step": 1772 }, { "epoch": 0.110875, "grad_norm": 3.359375, "grad_norm_var": 0.021906534830729168, "learning_rate": 0.0001, "loss": 8.4247, "loss/crossentropy": 2.179062843322754, "loss/hidden": 3.375, "loss/jsd": 0.0, "loss/logits": 0.2816423773765564, "step": 1774 }, { "epoch": 0.111, "grad_norm": 3.4375, "grad_norm_var": 0.0189117431640625, "learning_rate": 0.0001, "loss": 8.6942, "loss/crossentropy": 2.2097833156585693, "loss/hidden": 3.3515625, "loss/jsd": 0.0, "loss/logits": 0.30545778572559357, "step": 1776 }, { "epoch": 0.111125, "grad_norm": 3.46875, "grad_norm_var": 0.0134429931640625, "learning_rate": 0.0001, "loss": 8.5308, "loss/crossentropy": 2.35193407535553, "loss/hidden": 3.34375, "loss/jsd": 0.0, "loss/logits": 0.27761800587177277, "step": 1778 }, { "epoch": 0.11125, "grad_norm": 4.0625, "grad_norm_var": 0.0426666259765625, "learning_rate": 0.0001, "loss": 8.7591, "loss/crossentropy": 2.8923500776290894, "loss/hidden": 3.4765625, "loss/jsd": 0.0, "loss/logits": 0.31674572825431824, "step": 1780 }, { "epoch": 0.111375, "grad_norm": 3.390625, "grad_norm_var": 0.04739176432291667, "learning_rate": 0.0001, "loss": 8.7339, "loss/crossentropy": 2.4018853902816772, "loss/hidden": 3.4609375, "loss/jsd": 0.0, "loss/logits": 0.3184930384159088, "step": 1782 }, { "epoch": 0.1115, "grad_norm": 3.109375, "grad_norm_var": 0.0567047119140625, "learning_rate": 0.0001, "loss": 8.5982, "loss/crossentropy": 2.486867070198059, "loss/hidden": 3.328125, "loss/jsd": 0.0, "loss/logits": 0.28121723234653473, "step": 1784 }, { "epoch": 0.111625, "grad_norm": 3.59375, "grad_norm_var": 0.05657145182291667, "learning_rate": 0.0001, "loss": 8.9316, "loss/crossentropy": 2.6467264890670776, "loss/hidden": 3.484375, "loss/jsd": 0.0, "loss/logits": 0.33492469787597656, "step": 1786 }, { "epoch": 0.11175, "grad_norm": 3.40625, "grad_norm_var": 0.05624593098958333, "learning_rate": 0.0001, "loss": 8.7063, "loss/crossentropy": 2.251604914665222, "loss/hidden": 3.4609375, "loss/jsd": 0.0, "loss/logits": 0.30116818845272064, "step": 1788 }, { "epoch": 0.111875, "grad_norm": 3.5, "grad_norm_var": 0.056966145833333336, "learning_rate": 0.0001, "loss": 8.4636, "loss/crossentropy": 2.1624085903167725, "loss/hidden": 3.328125, "loss/jsd": 0.0, "loss/logits": 0.2598511800169945, "step": 1790 }, { "epoch": 0.112, "grad_norm": 3.59375, "grad_norm_var": 0.056818644205729164, "learning_rate": 0.0001, "loss": 8.4571, "loss/crossentropy": 2.0495232343673706, "loss/hidden": 3.265625, "loss/jsd": 0.0, "loss/logits": 0.24547497928142548, "step": 1792 }, { "epoch": 0.112125, "grad_norm": 3.484375, "grad_norm_var": 0.059794108072916664, "learning_rate": 0.0001, "loss": 8.8322, "loss/crossentropy": 2.664864659309387, "loss/hidden": 3.3828125, "loss/jsd": 0.0, "loss/logits": 0.30460360646247864, "step": 1794 }, { "epoch": 0.11225, "grad_norm": 3.515625, "grad_norm_var": 0.0298492431640625, "learning_rate": 0.0001, "loss": 8.646, "loss/crossentropy": 2.494004487991333, "loss/hidden": 3.3125, "loss/jsd": 0.0, "loss/logits": 0.2763901799917221, "step": 1796 }, { "epoch": 0.112375, "grad_norm": 3.140625, "grad_norm_var": 0.032515462239583334, "learning_rate": 0.0001, "loss": 8.3741, "loss/crossentropy": 2.1645578145980835, "loss/hidden": 3.328125, "loss/jsd": 0.0, "loss/logits": 0.2648225873708725, "step": 1798 }, { "epoch": 0.1125, "grad_norm": 3.453125, "grad_norm_var": 0.024413045247395834, "learning_rate": 0.0001, "loss": 8.4112, "loss/crossentropy": 2.09485399723053, "loss/hidden": 3.34375, "loss/jsd": 0.0, "loss/logits": 0.2805396020412445, "step": 1800 }, { "epoch": 0.112625, "grad_norm": 3.453125, "grad_norm_var": 0.022313435872395832, "learning_rate": 0.0001, "loss": 8.6201, "loss/crossentropy": 2.369633436203003, "loss/hidden": 3.2890625, "loss/jsd": 0.0, "loss/logits": 0.2918958216905594, "step": 1802 }, { "epoch": 0.11275, "grad_norm": 3.671875, "grad_norm_var": 0.026656087239583334, "learning_rate": 0.0001, "loss": 8.4743, "loss/crossentropy": 2.522893786430359, "loss/hidden": 3.3984375, "loss/jsd": 0.0, "loss/logits": 0.2649436295032501, "step": 1804 }, { "epoch": 0.112875, "grad_norm": 3.65625, "grad_norm_var": 0.03322652180989583, "learning_rate": 0.0001, "loss": 8.6938, "loss/crossentropy": 2.4980560541152954, "loss/hidden": 3.359375, "loss/jsd": 0.0, "loss/logits": 0.3046301603317261, "step": 1806 }, { "epoch": 0.113, "grad_norm": 3.765625, "grad_norm_var": 0.07986551920572917, "learning_rate": 0.0001, "loss": 8.6725, "loss/crossentropy": 2.320141911506653, "loss/hidden": 3.6015625, "loss/jsd": 0.0, "loss/logits": 0.3137675076723099, "step": 1808 }, { "epoch": 0.113125, "grad_norm": 3.71875, "grad_norm_var": 0.08251546223958334, "learning_rate": 0.0001, "loss": 8.7972, "loss/crossentropy": 2.2626017332077026, "loss/hidden": 3.4140625, "loss/jsd": 0.0, "loss/logits": 0.2625807821750641, "step": 1810 }, { "epoch": 0.11325, "grad_norm": 3.734375, "grad_norm_var": 0.08018290201822917, "learning_rate": 0.0001, "loss": 8.6687, "loss/crossentropy": 2.136113405227661, "loss/hidden": 3.5, "loss/jsd": 0.0, "loss/logits": 0.30796822905540466, "step": 1812 }, { "epoch": 0.113375, "grad_norm": 5.5625, "grad_norm_var": 0.3128326416015625, "learning_rate": 0.0001, "loss": 9.0458, "loss/crossentropy": 2.75032639503479, "loss/hidden": 3.3359375, "loss/jsd": 0.0, "loss/logits": 0.31279677152633667, "step": 1814 }, { "epoch": 0.1135, "grad_norm": 4.9375, "grad_norm_var": 0.3737457275390625, "learning_rate": 0.0001, "loss": 9.1144, "loss/crossentropy": 2.538609027862549, "loss/hidden": 3.484375, "loss/jsd": 0.0, "loss/logits": 0.32450392842292786, "step": 1816 }, { "epoch": 0.113625, "grad_norm": 5.0625, "grad_norm_var": 0.44909566243489585, "learning_rate": 0.0001, "loss": 8.8125, "loss/crossentropy": 2.322131633758545, "loss/hidden": 3.53125, "loss/jsd": 0.0, "loss/logits": 0.3943719118833542, "step": 1818 }, { "epoch": 0.11375, "grad_norm": 3.453125, "grad_norm_var": 0.445068359375, "learning_rate": 0.0001, "loss": 8.4801, "loss/crossentropy": 2.363473057746887, "loss/hidden": 3.3125, "loss/jsd": 0.0, "loss/logits": 0.30813442170619965, "step": 1820 }, { "epoch": 0.113875, "grad_norm": 3.15625, "grad_norm_var": 0.4840169270833333, "learning_rate": 0.0001, "loss": 8.5864, "loss/crossentropy": 2.3755160570144653, "loss/hidden": 3.2734375, "loss/jsd": 0.0, "loss/logits": 0.2846773564815521, "step": 1822 }, { "epoch": 0.114, "grad_norm": 3.21875, "grad_norm_var": 0.5322580973307292, "learning_rate": 0.0001, "loss": 8.4673, "loss/crossentropy": 2.3385982513427734, "loss/hidden": 3.2890625, "loss/jsd": 0.0, "loss/logits": 0.2795634865760803, "step": 1824 }, { "epoch": 0.114125, "grad_norm": 3.4375, "grad_norm_var": 0.5356730143229167, "learning_rate": 0.0001, "loss": 8.5786, "loss/crossentropy": 2.5109734535217285, "loss/hidden": 3.2734375, "loss/jsd": 0.0, "loss/logits": 0.28478285670280457, "step": 1826 }, { "epoch": 0.11425, "grad_norm": 3.46875, "grad_norm_var": 0.5471995035807292, "learning_rate": 0.0001, "loss": 8.6909, "loss/crossentropy": 2.5854321718215942, "loss/hidden": 3.484375, "loss/jsd": 0.0, "loss/logits": 0.31594397127628326, "step": 1828 }, { "epoch": 0.114375, "grad_norm": 3.21875, "grad_norm_var": 0.3377349853515625, "learning_rate": 0.0001, "loss": 8.5017, "loss/crossentropy": 2.3423889875411987, "loss/hidden": 3.296875, "loss/jsd": 0.0, "loss/logits": 0.3003038465976715, "step": 1830 }, { "epoch": 0.1145, "grad_norm": 3.578125, "grad_norm_var": 0.2010406494140625, "learning_rate": 0.0001, "loss": 8.8625, "loss/crossentropy": 2.612884998321533, "loss/hidden": 3.453125, "loss/jsd": 0.0, "loss/logits": 0.28992408514022827, "step": 1832 }, { "epoch": 0.114625, "grad_norm": 3.265625, "grad_norm_var": 0.026301066080729168, "learning_rate": 0.0001, "loss": 8.6717, "loss/crossentropy": 2.2408013343811035, "loss/hidden": 3.2734375, "loss/jsd": 0.0, "loss/logits": 0.27288930118083954, "step": 1834 }, { "epoch": 0.11475, "grad_norm": 3.3125, "grad_norm_var": 0.023957316080729166, "learning_rate": 0.0001, "loss": 8.7519, "loss/crossentropy": 2.5239880084991455, "loss/hidden": 3.5, "loss/jsd": 0.0, "loss/logits": 0.3304894417524338, "step": 1836 }, { "epoch": 0.114875, "grad_norm": 3.453125, "grad_norm_var": 0.017365519205729166, "learning_rate": 0.0001, "loss": 8.5288, "loss/crossentropy": 2.295746088027954, "loss/hidden": 3.3515625, "loss/jsd": 0.0, "loss/logits": 0.28775452077388763, "step": 1838 }, { "epoch": 0.115, "grad_norm": 3.421875, "grad_norm_var": 0.020198567708333334, "learning_rate": 0.0001, "loss": 8.5755, "loss/crossentropy": 2.367674708366394, "loss/hidden": 3.2890625, "loss/jsd": 0.0, "loss/logits": 0.2802489101886749, "step": 1840 }, { "epoch": 0.115125, "grad_norm": 3.1875, "grad_norm_var": 0.022777303059895834, "learning_rate": 0.0001, "loss": 8.6102, "loss/crossentropy": 2.387988567352295, "loss/hidden": 3.3671875, "loss/jsd": 0.0, "loss/logits": 0.2998049259185791, "step": 1842 }, { "epoch": 0.11525, "grad_norm": 3.0625, "grad_norm_var": 0.0284576416015625, "learning_rate": 0.0001, "loss": 8.4168, "loss/crossentropy": 2.1790847778320312, "loss/hidden": 3.2265625, "loss/jsd": 0.0, "loss/logits": 0.2700785994529724, "step": 1844 }, { "epoch": 0.115375, "grad_norm": 3.46875, "grad_norm_var": 0.024803670247395833, "learning_rate": 0.0001, "loss": 8.7029, "loss/crossentropy": 2.5277167558670044, "loss/hidden": 3.421875, "loss/jsd": 0.0, "loss/logits": 0.3007604479789734, "step": 1846 }, { "epoch": 0.1155, "grad_norm": 3.3125, "grad_norm_var": 0.021043904622395835, "learning_rate": 0.0001, "loss": 8.7105, "loss/crossentropy": 2.417070746421814, "loss/hidden": 3.4921875, "loss/jsd": 0.0, "loss/logits": 0.30565008521080017, "step": 1848 }, { "epoch": 0.115625, "grad_norm": 3.421875, "grad_norm_var": 0.03267313639322917, "learning_rate": 0.0001, "loss": 8.7572, "loss/crossentropy": 2.581299304962158, "loss/hidden": 3.3203125, "loss/jsd": 0.0, "loss/logits": 0.2796669900417328, "step": 1850 }, { "epoch": 0.11575, "grad_norm": 3.46875, "grad_norm_var": 0.03391825358072917, "learning_rate": 0.0001, "loss": 8.7661, "loss/crossentropy": 2.385110855102539, "loss/hidden": 3.359375, "loss/jsd": 0.0, "loss/logits": 0.2876836359500885, "step": 1852 }, { "epoch": 0.115875, "grad_norm": 3.59375, "grad_norm_var": 0.036637369791666666, "learning_rate": 0.0001, "loss": 9.1625, "loss/crossentropy": 2.503122568130493, "loss/hidden": 3.484375, "loss/jsd": 0.0, "loss/logits": 0.3233593702316284, "step": 1854 }, { "epoch": 0.116, "grad_norm": 3.453125, "grad_norm_var": 0.03401590983072917, "learning_rate": 0.0001, "loss": 8.964, "loss/crossentropy": 2.4058728218078613, "loss/hidden": 3.3984375, "loss/jsd": 0.0, "loss/logits": 0.349610835313797, "step": 1856 }, { "epoch": 0.116125, "grad_norm": 3.6875, "grad_norm_var": 0.032796223958333336, "learning_rate": 0.0001, "loss": 8.6335, "loss/crossentropy": 2.472745180130005, "loss/hidden": 3.34375, "loss/jsd": 0.0, "loss/logits": 0.28807832300662994, "step": 1858 }, { "epoch": 0.11625, "grad_norm": 3.171875, "grad_norm_var": 0.026276652018229166, "learning_rate": 0.0001, "loss": 8.6263, "loss/crossentropy": 2.377658724784851, "loss/hidden": 3.390625, "loss/jsd": 0.0, "loss/logits": 0.2969461679458618, "step": 1860 }, { "epoch": 0.116375, "grad_norm": 3.46875, "grad_norm_var": 0.030744425455729165, "learning_rate": 0.0001, "loss": 8.523, "loss/crossentropy": 2.261406660079956, "loss/hidden": 3.2734375, "loss/jsd": 0.0, "loss/logits": 0.2751418948173523, "step": 1862 }, { "epoch": 0.1165, "grad_norm": 3.734375, "grad_norm_var": 0.033722941080729166, "learning_rate": 0.0001, "loss": 8.552, "loss/crossentropy": 2.3539315462112427, "loss/hidden": 3.2578125, "loss/jsd": 0.0, "loss/logits": 0.2792647182941437, "step": 1864 }, { "epoch": 0.116625, "grad_norm": 3.375, "grad_norm_var": 0.026949055989583335, "learning_rate": 0.0001, "loss": 8.5773, "loss/crossentropy": 2.5640159845352173, "loss/hidden": 3.3984375, "loss/jsd": 0.0, "loss/logits": 0.2862507700920105, "step": 1866 }, { "epoch": 0.11675, "grad_norm": 3.203125, "grad_norm_var": 0.032013956705729166, "learning_rate": 0.0001, "loss": 8.489, "loss/crossentropy": 2.419552803039551, "loss/hidden": 3.2734375, "loss/jsd": 0.0, "loss/logits": 0.28335337340831757, "step": 1868 }, { "epoch": 0.116875, "grad_norm": 3.421875, "grad_norm_var": 0.029059855143229167, "learning_rate": 0.0001, "loss": 8.2858, "loss/crossentropy": 2.206141471862793, "loss/hidden": 3.484375, "loss/jsd": 0.0, "loss/logits": 0.2972014546394348, "step": 1870 }, { "epoch": 0.117, "grad_norm": 3.5625, "grad_norm_var": 0.03101806640625, "learning_rate": 0.0001, "loss": 8.4301, "loss/crossentropy": 2.2040516138076782, "loss/hidden": 3.3359375, "loss/jsd": 0.0, "loss/logits": 0.28033290803432465, "step": 1872 }, { "epoch": 0.117125, "grad_norm": 3.40625, "grad_norm_var": 0.0343902587890625, "learning_rate": 0.0001, "loss": 8.6147, "loss/crossentropy": 2.4695621728897095, "loss/hidden": 3.3828125, "loss/jsd": 0.0, "loss/logits": 0.28286705911159515, "step": 1874 }, { "epoch": 0.11725, "grad_norm": 3.53125, "grad_norm_var": 0.03194986979166667, "learning_rate": 0.0001, "loss": 8.6246, "loss/crossentropy": 2.3480403423309326, "loss/hidden": 3.3828125, "loss/jsd": 0.0, "loss/logits": 0.29076308012008667, "step": 1876 }, { "epoch": 0.117375, "grad_norm": 3.1875, "grad_norm_var": 0.033991495768229164, "learning_rate": 0.0001, "loss": 8.6359, "loss/crossentropy": 2.3183737993240356, "loss/hidden": 3.4609375, "loss/jsd": 0.0, "loss/logits": 0.29951538145542145, "step": 1878 }, { "epoch": 0.1175, "grad_norm": 3.765625, "grad_norm_var": 0.03533528645833333, "learning_rate": 0.0001, "loss": 8.7296, "loss/crossentropy": 2.259859085083008, "loss/hidden": 3.265625, "loss/jsd": 0.0, "loss/logits": 0.28555014729499817, "step": 1880 }, { "epoch": 0.117625, "grad_norm": 3.203125, "grad_norm_var": 0.03573811848958333, "learning_rate": 0.0001, "loss": 8.5874, "loss/crossentropy": 2.5705759525299072, "loss/hidden": 3.390625, "loss/jsd": 0.0, "loss/logits": 0.2966092526912689, "step": 1882 }, { "epoch": 0.11775, "grad_norm": 3.546875, "grad_norm_var": 0.034619140625, "learning_rate": 0.0001, "loss": 8.6653, "loss/crossentropy": 2.324189782142639, "loss/hidden": 3.3203125, "loss/jsd": 0.0, "loss/logits": 0.28070059418678284, "step": 1884 }, { "epoch": 0.117875, "grad_norm": 4.09375, "grad_norm_var": 0.06123046875, "learning_rate": 0.0001, "loss": 8.9445, "loss/crossentropy": 2.6003164052963257, "loss/hidden": 3.5234375, "loss/jsd": 0.0, "loss/logits": 0.42599718272686005, "step": 1886 }, { "epoch": 0.118, "grad_norm": 3.359375, "grad_norm_var": 0.06826070149739584, "learning_rate": 0.0001, "loss": 8.4642, "loss/crossentropy": 2.1944313049316406, "loss/hidden": 3.2890625, "loss/jsd": 0.0, "loss/logits": 0.27223484218120575, "step": 1888 }, { "epoch": 0.118125, "grad_norm": 3.328125, "grad_norm_var": 0.0644683837890625, "learning_rate": 0.0001, "loss": 8.6562, "loss/crossentropy": 2.445297122001648, "loss/hidden": 3.2890625, "loss/jsd": 0.0, "loss/logits": 0.29588577151298523, "step": 1890 }, { "epoch": 0.11825, "grad_norm": 3.421875, "grad_norm_var": 0.08600260416666666, "learning_rate": 0.0001, "loss": 8.6856, "loss/crossentropy": 2.2999027371406555, "loss/hidden": 3.3046875, "loss/jsd": 0.0, "loss/logits": 0.2753405198454857, "step": 1892 }, { "epoch": 0.118375, "grad_norm": 3.390625, "grad_norm_var": 0.07831929524739584, "learning_rate": 0.0001, "loss": 8.5229, "loss/crossentropy": 2.399373769760132, "loss/hidden": 3.421875, "loss/jsd": 0.0, "loss/logits": 0.2927252948284149, "step": 1894 }, { "epoch": 0.1185, "grad_norm": 3.203125, "grad_norm_var": 0.07942606608072916, "learning_rate": 0.0001, "loss": 8.5229, "loss/crossentropy": 2.4299787282943726, "loss/hidden": 3.34375, "loss/jsd": 0.0, "loss/logits": 0.30517131090164185, "step": 1896 }, { "epoch": 0.118625, "grad_norm": 3.453125, "grad_norm_var": 0.07559305826822917, "learning_rate": 0.0001, "loss": 8.4977, "loss/crossentropy": 1.9518161416053772, "loss/hidden": 3.2734375, "loss/jsd": 0.0, "loss/logits": 0.27373121678829193, "step": 1898 }, { "epoch": 0.11875, "grad_norm": 3.734375, "grad_norm_var": 0.07967020670572916, "learning_rate": 0.0001, "loss": 8.4314, "loss/crossentropy": 2.2160075902938843, "loss/hidden": 3.390625, "loss/jsd": 0.0, "loss/logits": 0.2800438925623894, "step": 1900 }, { "epoch": 0.118875, "grad_norm": 3.171875, "grad_norm_var": 0.052912394205729164, "learning_rate": 0.0001, "loss": 8.4264, "loss/crossentropy": 2.2488516569137573, "loss/hidden": 3.2578125, "loss/jsd": 0.0, "loss/logits": 0.2859164774417877, "step": 1902 }, { "epoch": 0.119, "grad_norm": 3.390625, "grad_norm_var": 0.04648030598958333, "learning_rate": 0.0001, "loss": 8.7733, "loss/crossentropy": 2.298749089241028, "loss/hidden": 3.3828125, "loss/jsd": 0.0, "loss/logits": 0.3013346642255783, "step": 1904 }, { "epoch": 0.119125, "grad_norm": 3.34375, "grad_norm_var": 0.04391276041666667, "learning_rate": 0.0001, "loss": 8.607, "loss/crossentropy": 2.5380003452301025, "loss/hidden": 3.3046875, "loss/jsd": 0.0, "loss/logits": 0.2967790812253952, "step": 1906 }, { "epoch": 0.11925, "grad_norm": 3.828125, "grad_norm_var": 0.03439127604166667, "learning_rate": 0.0001, "loss": 8.673, "loss/crossentropy": 2.4969903230667114, "loss/hidden": 3.4609375, "loss/jsd": 0.0, "loss/logits": 0.34014879167079926, "step": 1908 }, { "epoch": 0.119375, "grad_norm": 3.734375, "grad_norm_var": 0.04801025390625, "learning_rate": 0.0001, "loss": 8.9499, "loss/crossentropy": 2.326760768890381, "loss/hidden": 3.453125, "loss/jsd": 0.0, "loss/logits": 0.3319539427757263, "step": 1910 }, { "epoch": 0.1195, "grad_norm": 3.46875, "grad_norm_var": 0.04390869140625, "learning_rate": 0.0001, "loss": 8.983, "loss/crossentropy": 2.3548574447631836, "loss/hidden": 3.390625, "loss/jsd": 0.0, "loss/logits": 0.3132203370332718, "step": 1912 }, { "epoch": 0.119625, "grad_norm": 3.25, "grad_norm_var": 0.04912109375, "learning_rate": 0.0001, "loss": 8.4837, "loss/crossentropy": 2.301589846611023, "loss/hidden": 3.4140625, "loss/jsd": 0.0, "loss/logits": 0.28266826272010803, "step": 1914 }, { "epoch": 0.11975, "grad_norm": 3.3125, "grad_norm_var": 0.0535797119140625, "learning_rate": 0.0001, "loss": 9.0105, "loss/crossentropy": 2.2812716960906982, "loss/hidden": 3.3203125, "loss/jsd": 0.0, "loss/logits": 0.3110269755125046, "step": 1916 }, { "epoch": 0.119875, "grad_norm": 3.65625, "grad_norm_var": 0.05188700358072917, "learning_rate": 0.0001, "loss": 8.7659, "loss/crossentropy": 2.4299668073654175, "loss/hidden": 3.2890625, "loss/jsd": 0.0, "loss/logits": 0.2899664491415024, "step": 1918 }, { "epoch": 0.12, "grad_norm": 3.421875, "grad_norm_var": 0.059403483072916666, "learning_rate": 0.0001, "loss": 8.6172, "loss/crossentropy": 2.2184523940086365, "loss/hidden": 3.3203125, "loss/jsd": 0.0, "loss/logits": 0.2705395221710205, "step": 1920 }, { "epoch": 0.120125, "grad_norm": 3.28125, "grad_norm_var": 0.061742146809895836, "learning_rate": 0.0001, "loss": 8.6808, "loss/crossentropy": 2.361995577812195, "loss/hidden": 3.3046875, "loss/jsd": 0.0, "loss/logits": 0.2976345121860504, "step": 1922 }, { "epoch": 0.12025, "grad_norm": 3.140625, "grad_norm_var": 0.06204427083333333, "learning_rate": 0.0001, "loss": 8.572, "loss/crossentropy": 2.0673895478248596, "loss/hidden": 3.328125, "loss/jsd": 0.0, "loss/logits": 0.2614962309598923, "step": 1924 }, { "epoch": 0.120375, "grad_norm": 3.265625, "grad_norm_var": 0.050324503580729166, "learning_rate": 0.0001, "loss": 8.3519, "loss/crossentropy": 2.333189368247986, "loss/hidden": 3.296875, "loss/jsd": 0.0, "loss/logits": 0.2846105396747589, "step": 1926 }, { "epoch": 0.1205, "grad_norm": 3.921875, "grad_norm_var": 0.07100321451822916, "learning_rate": 0.0001, "loss": 8.5546, "loss/crossentropy": 2.3604001998901367, "loss/hidden": 3.296875, "loss/jsd": 0.0, "loss/logits": 0.2743445485830307, "step": 1928 }, { "epoch": 0.120625, "grad_norm": 3.15625, "grad_norm_var": 0.07200113932291667, "learning_rate": 0.0001, "loss": 8.4642, "loss/crossentropy": 2.3226892948150635, "loss/hidden": 3.34375, "loss/jsd": 0.0, "loss/logits": 0.2899642139673233, "step": 1930 }, { "epoch": 0.12075, "grad_norm": 3.140625, "grad_norm_var": 0.06077372233072917, "learning_rate": 0.0001, "loss": 8.4469, "loss/crossentropy": 2.2486273050308228, "loss/hidden": 3.3359375, "loss/jsd": 0.0, "loss/logits": 0.29645949602127075, "step": 1932 }, { "epoch": 0.120875, "grad_norm": 3.375, "grad_norm_var": 0.04622294108072917, "learning_rate": 0.0001, "loss": 8.4223, "loss/crossentropy": 2.590661644935608, "loss/hidden": 3.2734375, "loss/jsd": 0.0, "loss/logits": 0.2843063771724701, "step": 1934 }, { "epoch": 0.121, "grad_norm": 2.984375, "grad_norm_var": 0.05113525390625, "learning_rate": 0.0001, "loss": 8.5287, "loss/crossentropy": 2.392248034477234, "loss/hidden": 3.3125, "loss/jsd": 0.0, "loss/logits": 0.28749269247055054, "step": 1936 }, { "epoch": 0.121125, "grad_norm": 3.5, "grad_norm_var": 0.05465087890625, "learning_rate": 0.0001, "loss": 8.4881, "loss/crossentropy": 2.3735626935958862, "loss/hidden": 3.4765625, "loss/jsd": 0.0, "loss/logits": 0.3381284326314926, "step": 1938 }, { "epoch": 0.12125, "grad_norm": 3.953125, "grad_norm_var": 0.07750244140625, "learning_rate": 0.0001, "loss": 8.7097, "loss/crossentropy": 2.5563061237335205, "loss/hidden": 3.4140625, "loss/jsd": 0.0, "loss/logits": 0.3243313729763031, "step": 1940 }, { "epoch": 0.121375, "grad_norm": 3.078125, "grad_norm_var": 0.0794342041015625, "learning_rate": 0.0001, "loss": 8.6438, "loss/crossentropy": 2.2933984994888306, "loss/hidden": 3.28125, "loss/jsd": 0.0, "loss/logits": 0.29726622998714447, "step": 1942 }, { "epoch": 0.1215, "grad_norm": 3.390625, "grad_norm_var": 0.06724344889322917, "learning_rate": 0.0001, "loss": 8.4159, "loss/crossentropy": 2.420395255088806, "loss/hidden": 3.296875, "loss/jsd": 0.0, "loss/logits": 0.2696368545293808, "step": 1944 }, { "epoch": 0.121625, "grad_norm": 3.15625, "grad_norm_var": 0.06702067057291666, "learning_rate": 0.0001, "loss": 8.5616, "loss/crossentropy": 2.1465864777565002, "loss/hidden": 3.28125, "loss/jsd": 0.0, "loss/logits": 0.2801681458950043, "step": 1946 }, { "epoch": 0.12175, "grad_norm": 3.515625, "grad_norm_var": 0.11573893229166667, "learning_rate": 0.0001, "loss": 8.8623, "loss/crossentropy": 2.454163372516632, "loss/hidden": 3.5703125, "loss/jsd": 0.0, "loss/logits": 0.27878041565418243, "step": 1948 }, { "epoch": 0.121875, "grad_norm": 3.40625, "grad_norm_var": 0.12135009765625, "learning_rate": 0.0001, "loss": 8.2899, "loss/crossentropy": 2.143743395805359, "loss/hidden": 3.25, "loss/jsd": 0.0, "loss/logits": 0.2612776756286621, "step": 1950 }, { "epoch": 0.122, "grad_norm": 3.71875, "grad_norm_var": 0.13105367024739584, "learning_rate": 0.0001, "loss": 9.0178, "loss/crossentropy": 2.4097328186035156, "loss/hidden": 3.5234375, "loss/jsd": 0.0, "loss/logits": 0.3114086985588074, "step": 1952 }, { "epoch": 0.122125, "grad_norm": 5.15625, "grad_norm_var": 0.2849355061848958, "learning_rate": 0.0001, "loss": 8.7298, "loss/crossentropy": 2.3883849382400513, "loss/hidden": 3.359375, "loss/jsd": 0.0, "loss/logits": 0.3800114840269089, "step": 1954 }, { "epoch": 0.12225, "grad_norm": 3.15625, "grad_norm_var": 0.31787007649739585, "learning_rate": 0.0001, "loss": 8.4569, "loss/crossentropy": 2.2481584548950195, "loss/hidden": 3.46875, "loss/jsd": 0.0, "loss/logits": 0.2927142381668091, "step": 1956 }, { "epoch": 0.122375, "grad_norm": 3.34375, "grad_norm_var": 0.3129058837890625, "learning_rate": 0.0001, "loss": 8.5422, "loss/crossentropy": 2.096981406211853, "loss/hidden": 3.2890625, "loss/jsd": 0.0, "loss/logits": 0.27731966972351074, "step": 1958 }, { "epoch": 0.1225, "grad_norm": 3.4375, "grad_norm_var": 0.2894205729166667, "learning_rate": 0.0001, "loss": 8.5498, "loss/crossentropy": 2.531270146369934, "loss/hidden": 3.3359375, "loss/jsd": 0.0, "loss/logits": 0.265569344162941, "step": 1960 }, { "epoch": 0.122625, "grad_norm": 3.25, "grad_norm_var": 0.28873291015625, "learning_rate": 0.0001, "loss": 8.521, "loss/crossentropy": 2.6065129041671753, "loss/hidden": 3.3828125, "loss/jsd": 0.0, "loss/logits": 0.2991243004798889, "step": 1962 }, { "epoch": 0.12275, "grad_norm": 3.546875, "grad_norm_var": 0.2656158447265625, "learning_rate": 0.0001, "loss": 8.5729, "loss/crossentropy": 2.261076807975769, "loss/hidden": 3.2890625, "loss/jsd": 0.0, "loss/logits": 0.28016628324985504, "step": 1964 }, { "epoch": 0.122875, "grad_norm": 3.34375, "grad_norm_var": 0.2662750244140625, "learning_rate": 0.0001, "loss": 8.7329, "loss/crossentropy": 2.4433281421661377, "loss/hidden": 3.3125, "loss/jsd": 0.0, "loss/logits": 0.2746448367834091, "step": 1966 }, { "epoch": 0.123, "grad_norm": 3.625, "grad_norm_var": 0.3149810791015625, "learning_rate": 0.0001, "loss": 8.7222, "loss/crossentropy": 2.2890138626098633, "loss/hidden": 3.390625, "loss/jsd": 0.0, "loss/logits": 0.29224395751953125, "step": 1968 }, { "epoch": 0.123125, "grad_norm": 3.21875, "grad_norm_var": 0.15416259765625, "learning_rate": 0.0001, "loss": 8.3613, "loss/crossentropy": 2.1031445264816284, "loss/hidden": 3.3359375, "loss/jsd": 0.0, "loss/logits": 0.2901540696620941, "step": 1970 }, { "epoch": 0.12325, "grad_norm": 3.453125, "grad_norm_var": 0.11484273274739583, "learning_rate": 0.0001, "loss": 8.4639, "loss/crossentropy": 2.240355134010315, "loss/hidden": 3.2421875, "loss/jsd": 0.0, "loss/logits": 0.27891701459884644, "step": 1972 }, { "epoch": 0.123375, "grad_norm": 3.25, "grad_norm_var": 0.11433919270833333, "learning_rate": 0.0001, "loss": 8.6588, "loss/crossentropy": 2.526862621307373, "loss/hidden": 3.34375, "loss/jsd": 0.0, "loss/logits": 0.31067635118961334, "step": 1974 }, { "epoch": 0.1235, "grad_norm": 3.078125, "grad_norm_var": 0.12138570149739583, "learning_rate": 0.0001, "loss": 8.4162, "loss/crossentropy": 2.389139413833618, "loss/hidden": 3.3359375, "loss/jsd": 0.0, "loss/logits": 0.3020573705434799, "step": 1976 }, { "epoch": 0.123625, "grad_norm": 3.59375, "grad_norm_var": 0.12506103515625, "learning_rate": 0.0001, "loss": 8.4219, "loss/crossentropy": 2.438482403755188, "loss/hidden": 3.3203125, "loss/jsd": 0.0, "loss/logits": 0.26280972361564636, "step": 1978 }, { "epoch": 0.12375, "grad_norm": 3.21875, "grad_norm_var": 0.12905985514322918, "learning_rate": 0.0001, "loss": 8.7076, "loss/crossentropy": 2.342813014984131, "loss/hidden": 3.2890625, "loss/jsd": 0.0, "loss/logits": 0.2951369285583496, "step": 1980 }, { "epoch": 0.123875, "grad_norm": 3.296875, "grad_norm_var": 0.13946024576822916, "learning_rate": 0.0001, "loss": 8.4449, "loss/crossentropy": 2.1303473114967346, "loss/hidden": 3.2265625, "loss/jsd": 0.0, "loss/logits": 0.26047107577323914, "step": 1982 }, { "epoch": 0.124, "grad_norm": 3.140625, "grad_norm_var": 0.030985514322916668, "learning_rate": 0.0001, "loss": 8.5657, "loss/crossentropy": 2.470989942550659, "loss/hidden": 3.34375, "loss/jsd": 0.0, "loss/logits": 0.289617583155632, "step": 1984 }, { "epoch": 0.124125, "grad_norm": 3.59375, "grad_norm_var": 0.03736572265625, "learning_rate": 0.0001, "loss": 8.541, "loss/crossentropy": 2.4312928915023804, "loss/hidden": 3.34375, "loss/jsd": 0.0, "loss/logits": 0.2940659523010254, "step": 1986 }, { "epoch": 0.12425, "grad_norm": 3.109375, "grad_norm_var": 0.0355865478515625, "learning_rate": 0.0001, "loss": 8.5967, "loss/crossentropy": 2.408700704574585, "loss/hidden": 3.3046875, "loss/jsd": 0.0, "loss/logits": 0.29635827243328094, "step": 1988 }, { "epoch": 0.124375, "grad_norm": 3.09375, "grad_norm_var": 0.0367095947265625, "learning_rate": 0.0001, "loss": 8.5236, "loss/crossentropy": 2.222353756427765, "loss/hidden": 3.3671875, "loss/jsd": 0.0, "loss/logits": 0.2752675265073776, "step": 1990 }, { "epoch": 0.1245, "grad_norm": 3.390625, "grad_norm_var": 0.03466796875, "learning_rate": 0.0001, "loss": 8.7415, "loss/crossentropy": 2.5249141454696655, "loss/hidden": 3.3359375, "loss/jsd": 0.0, "loss/logits": 0.31805843114852905, "step": 1992 }, { "epoch": 0.124625, "grad_norm": 3.296875, "grad_norm_var": 0.0289947509765625, "learning_rate": 0.0001, "loss": 8.3748, "loss/crossentropy": 1.9848942756652832, "loss/hidden": 3.2578125, "loss/jsd": 0.0, "loss/logits": 0.2688567340373993, "step": 1994 }, { "epoch": 0.12475, "grad_norm": 3.453125, "grad_norm_var": 0.02486572265625, "learning_rate": 0.0001, "loss": 8.8413, "loss/crossentropy": 2.657674193382263, "loss/hidden": 3.3515625, "loss/jsd": 0.0, "loss/logits": 0.27856191992759705, "step": 1996 }, { "epoch": 0.124875, "grad_norm": 3.171875, "grad_norm_var": 0.023688761393229167, "learning_rate": 0.0001, "loss": 8.3386, "loss/crossentropy": 2.289615035057068, "loss/hidden": 3.234375, "loss/jsd": 0.0, "loss/logits": 0.27829277515411377, "step": 1998 }, { "epoch": 0.125, "grad_norm": 3.40625, "grad_norm_var": 0.0205230712890625, "learning_rate": 0.0001, "loss": 8.5863, "loss/crossentropy": 2.5303882360458374, "loss/hidden": 3.34375, "loss/jsd": 0.0, "loss/logits": 0.2958512306213379, "step": 2000 }, { "epoch": 0.125125, "grad_norm": 2.921875, "grad_norm_var": 0.025028483072916666, "learning_rate": 0.0001, "loss": 8.3451, "loss/crossentropy": 2.200709104537964, "loss/hidden": 3.28125, "loss/jsd": 0.0, "loss/logits": 0.26912133395671844, "step": 2002 }, { "epoch": 0.12525, "grad_norm": 3.203125, "grad_norm_var": 0.0232330322265625, "learning_rate": 0.0001, "loss": 8.3675, "loss/crossentropy": 2.523932099342346, "loss/hidden": 3.3046875, "loss/jsd": 0.0, "loss/logits": 0.292253702878952, "step": 2004 }, { "epoch": 0.125375, "grad_norm": 2.921875, "grad_norm_var": 0.029662068684895834, "learning_rate": 0.0001, "loss": 8.4154, "loss/crossentropy": 2.3884671926498413, "loss/hidden": 3.3515625, "loss/jsd": 0.0, "loss/logits": 0.28686901926994324, "step": 2006 }, { "epoch": 0.1255, "grad_norm": 3.34375, "grad_norm_var": 0.032633463541666664, "learning_rate": 0.0001, "loss": 8.4873, "loss/crossentropy": 2.492961883544922, "loss/hidden": 3.3203125, "loss/jsd": 0.0, "loss/logits": 0.2938031554222107, "step": 2008 }, { "epoch": 0.125625, "grad_norm": 3.46875, "grad_norm_var": 0.04558919270833333, "learning_rate": 0.0001, "loss": 8.4653, "loss/crossentropy": 2.1341389417648315, "loss/hidden": 3.328125, "loss/jsd": 0.0, "loss/logits": 0.2501054108142853, "step": 2010 }, { "epoch": 0.12575, "grad_norm": 3.390625, "grad_norm_var": 0.043919881184895836, "learning_rate": 0.0001, "loss": 8.6647, "loss/crossentropy": 2.3359371423721313, "loss/hidden": 3.34375, "loss/jsd": 0.0, "loss/logits": 0.2804300785064697, "step": 2012 }, { "epoch": 0.125875, "grad_norm": 3.296875, "grad_norm_var": 0.04475809733072917, "learning_rate": 0.0001, "loss": 8.6907, "loss/crossentropy": 2.5302098989486694, "loss/hidden": 3.2890625, "loss/jsd": 0.0, "loss/logits": 0.28145162761211395, "step": 2014 }, { "epoch": 0.126, "grad_norm": 3.234375, "grad_norm_var": 0.04456278483072917, "learning_rate": 0.0001, "loss": 8.6119, "loss/crossentropy": 2.437941551208496, "loss/hidden": 3.3125, "loss/jsd": 0.0, "loss/logits": 0.26573941111564636, "step": 2016 }, { "epoch": 0.126125, "grad_norm": 3.40625, "grad_norm_var": 0.035676066080729166, "learning_rate": 0.0001, "loss": 8.6258, "loss/crossentropy": 2.6573965549468994, "loss/hidden": 3.3984375, "loss/jsd": 0.0, "loss/logits": 0.29143331944942474, "step": 2018 }, { "epoch": 0.12625, "grad_norm": 3.34375, "grad_norm_var": 0.03544921875, "learning_rate": 0.0001, "loss": 8.4249, "loss/crossentropy": 2.4629688262939453, "loss/hidden": 3.328125, "loss/jsd": 0.0, "loss/logits": 0.28354427218437195, "step": 2020 }, { "epoch": 0.126375, "grad_norm": 4.21875, "grad_norm_var": 0.0703277587890625, "learning_rate": 0.0001, "loss": 8.8307, "loss/crossentropy": 2.252517580986023, "loss/hidden": 3.328125, "loss/jsd": 0.0, "loss/logits": 0.3386313170194626, "step": 2022 }, { "epoch": 0.1265, "grad_norm": 3.34375, "grad_norm_var": 0.06189676920572917, "learning_rate": 0.0001, "loss": 8.2863, "loss/crossentropy": 2.348217487335205, "loss/hidden": 3.2578125, "loss/jsd": 0.0, "loss/logits": 0.24487578123807907, "step": 2024 }, { "epoch": 0.126625, "grad_norm": 3.234375, "grad_norm_var": 0.057779947916666664, "learning_rate": 0.0001, "loss": 8.3008, "loss/crossentropy": 2.2095032930374146, "loss/hidden": 3.3125, "loss/jsd": 0.0, "loss/logits": 0.2705332636833191, "step": 2026 }, { "epoch": 0.12675, "grad_norm": 3.15625, "grad_norm_var": 0.06265869140625, "learning_rate": 0.0001, "loss": 8.4107, "loss/crossentropy": 2.167417824268341, "loss/hidden": 3.2734375, "loss/jsd": 0.0, "loss/logits": 0.27016985416412354, "step": 2028 }, { "epoch": 0.126875, "grad_norm": 3.234375, "grad_norm_var": 0.06298726399739583, "learning_rate": 0.0001, "loss": 8.4653, "loss/crossentropy": 2.316556692123413, "loss/hidden": 3.34375, "loss/jsd": 0.0, "loss/logits": 0.2680739611387253, "step": 2030 }, { "epoch": 0.127, "grad_norm": 3.0625, "grad_norm_var": 0.06646728515625, "learning_rate": 0.0001, "loss": 8.4679, "loss/crossentropy": 2.220218300819397, "loss/hidden": 3.265625, "loss/jsd": 0.0, "loss/logits": 0.2792097181081772, "step": 2032 }, { "epoch": 0.127125, "grad_norm": 3.3125, "grad_norm_var": 0.06718343098958333, "learning_rate": 0.0001, "loss": 8.4241, "loss/crossentropy": 2.2215596437454224, "loss/hidden": 3.28125, "loss/jsd": 0.0, "loss/logits": 0.2561402767896652, "step": 2034 }, { "epoch": 0.12725, "grad_norm": 3.109375, "grad_norm_var": 0.0806060791015625, "learning_rate": 0.0001, "loss": 8.1138, "loss/crossentropy": 2.2149853706359863, "loss/hidden": 3.2734375, "loss/jsd": 0.0, "loss/logits": 0.26566116511821747, "step": 2036 }, { "epoch": 0.127375, "grad_norm": 3.15625, "grad_norm_var": 0.020148722330729167, "learning_rate": 0.0001, "loss": 8.4906, "loss/crossentropy": 2.054360866546631, "loss/hidden": 3.3046875, "loss/jsd": 0.0, "loss/logits": 0.31411731243133545, "step": 2038 }, { "epoch": 0.1275, "grad_norm": 3.71875, "grad_norm_var": 0.04202473958333333, "learning_rate": 0.0001, "loss": 8.3689, "loss/crossentropy": 2.3898195028305054, "loss/hidden": 3.3046875, "loss/jsd": 0.0, "loss/logits": 0.25973349064588547, "step": 2040 }, { "epoch": 0.127625, "grad_norm": 3.171875, "grad_norm_var": 0.04273173014322917, "learning_rate": 0.0001, "loss": 8.4614, "loss/crossentropy": 2.2964980602264404, "loss/hidden": 3.296875, "loss/jsd": 0.0, "loss/logits": 0.2612931877374649, "step": 2042 }, { "epoch": 0.12775, "grad_norm": 3.1875, "grad_norm_var": 0.0513092041015625, "learning_rate": 0.0001, "loss": 8.434, "loss/crossentropy": 2.58343243598938, "loss/hidden": 3.2421875, "loss/jsd": 0.0, "loss/logits": 0.26787108182907104, "step": 2044 }, { "epoch": 0.127875, "grad_norm": 3.25, "grad_norm_var": 0.046708170572916666, "learning_rate": 0.0001, "loss": 8.4918, "loss/crossentropy": 2.24523389339447, "loss/hidden": 3.2265625, "loss/jsd": 0.0, "loss/logits": 0.2874600887298584, "step": 2046 }, { "epoch": 0.128, "grad_norm": 3.453125, "grad_norm_var": 0.04885660807291667, "learning_rate": 0.0001, "loss": 8.6684, "loss/crossentropy": 2.2914642095565796, "loss/hidden": 3.296875, "loss/jsd": 0.0, "loss/logits": 0.3138655722141266, "step": 2048 }, { "epoch": 0.128125, "grad_norm": 3.25, "grad_norm_var": 0.0490386962890625, "learning_rate": 0.0001, "loss": 8.5455, "loss/crossentropy": 2.5149351358413696, "loss/hidden": 3.34375, "loss/jsd": 0.0, "loss/logits": 0.29720842838287354, "step": 2050 }, { "epoch": 0.12825, "grad_norm": 3.25, "grad_norm_var": 0.04035542805989583, "learning_rate": 0.0001, "loss": 8.3206, "loss/crossentropy": 2.1453710794448853, "loss/hidden": 3.3125, "loss/jsd": 0.0, "loss/logits": 0.23647580295801163, "step": 2052 }, { "epoch": 0.128375, "grad_norm": 3.09375, "grad_norm_var": 0.053831990559895834, "learning_rate": 0.0001, "loss": 8.2585, "loss/crossentropy": 2.246112108230591, "loss/hidden": 3.2578125, "loss/jsd": 0.0, "loss/logits": 0.26230376958847046, "step": 2054 }, { "epoch": 0.1285, "grad_norm": 3.203125, "grad_norm_var": 0.028238932291666668, "learning_rate": 0.0001, "loss": 8.3248, "loss/crossentropy": 2.412488341331482, "loss/hidden": 3.28125, "loss/jsd": 0.0, "loss/logits": 0.26848049461841583, "step": 2056 }, { "epoch": 0.128625, "grad_norm": 3.09375, "grad_norm_var": 0.027228800455729167, "learning_rate": 0.0001, "loss": 8.6028, "loss/crossentropy": 2.4104580879211426, "loss/hidden": 3.28125, "loss/jsd": 0.0, "loss/logits": 0.27841897308826447, "step": 2058 }, { "epoch": 0.12875, "grad_norm": 3.109375, "grad_norm_var": 0.025007120768229165, "learning_rate": 0.0001, "loss": 8.6475, "loss/crossentropy": 2.443092107772827, "loss/hidden": 3.2890625, "loss/jsd": 0.0, "loss/logits": 0.27504517138004303, "step": 2060 }, { "epoch": 0.128875, "grad_norm": 3.234375, "grad_norm_var": 0.025812784830729168, "learning_rate": 0.0001, "loss": 8.5933, "loss/crossentropy": 2.401219129562378, "loss/hidden": 3.265625, "loss/jsd": 0.0, "loss/logits": 0.29284021258354187, "step": 2062 }, { "epoch": 0.129, "grad_norm": 3.125, "grad_norm_var": 0.031208292643229166, "learning_rate": 0.0001, "loss": 8.4635, "loss/crossentropy": 2.2347441911697388, "loss/hidden": 3.3046875, "loss/jsd": 0.0, "loss/logits": 0.27816687524318695, "step": 2064 }, { "epoch": 0.129125, "grad_norm": 3.375, "grad_norm_var": 0.03675028483072917, "learning_rate": 0.0001, "loss": 8.4876, "loss/crossentropy": 2.444359302520752, "loss/hidden": 3.28125, "loss/jsd": 0.0, "loss/logits": 0.28021493554115295, "step": 2066 }, { "epoch": 0.12925, "grad_norm": 3.25, "grad_norm_var": 0.03655192057291667, "learning_rate": 0.0001, "loss": 8.4261, "loss/crossentropy": 2.32525098323822, "loss/hidden": 3.3125, "loss/jsd": 0.0, "loss/logits": 0.2960277646780014, "step": 2068 }, { "epoch": 0.129375, "grad_norm": 3.296875, "grad_norm_var": 0.023323567708333333, "learning_rate": 0.0001, "loss": 8.4241, "loss/crossentropy": 2.489911675453186, "loss/hidden": 3.296875, "loss/jsd": 0.0, "loss/logits": 0.2791932672262192, "step": 2070 }, { "epoch": 0.1295, "grad_norm": 3.53125, "grad_norm_var": 0.026981608072916666, "learning_rate": 0.0001, "loss": 8.5868, "loss/crossentropy": 2.3564376831054688, "loss/hidden": 3.296875, "loss/jsd": 0.0, "loss/logits": 0.2700600177049637, "step": 2072 }, { "epoch": 0.129625, "grad_norm": 3.125, "grad_norm_var": 0.026009114583333333, "learning_rate": 0.0001, "loss": 8.4257, "loss/crossentropy": 2.5643441677093506, "loss/hidden": 3.25, "loss/jsd": 0.0, "loss/logits": 0.26559390127658844, "step": 2074 }, { "epoch": 0.12975, "grad_norm": 3.375, "grad_norm_var": 0.02750244140625, "learning_rate": 0.0001, "loss": 8.5631, "loss/crossentropy": 2.390055775642395, "loss/hidden": 3.3046875, "loss/jsd": 0.0, "loss/logits": 0.2720278650522232, "step": 2076 }, { "epoch": 0.129875, "grad_norm": 3.09375, "grad_norm_var": 0.026610310872395834, "learning_rate": 0.0001, "loss": 8.1939, "loss/crossentropy": 2.241236686706543, "loss/hidden": 3.296875, "loss/jsd": 0.0, "loss/logits": 0.2665587216615677, "step": 2078 }, { "epoch": 0.13, "grad_norm": 3.09375, "grad_norm_var": 0.021516927083333335, "learning_rate": 0.0001, "loss": 8.3988, "loss/crossentropy": 2.282191276550293, "loss/hidden": 3.296875, "loss/jsd": 0.0, "loss/logits": 0.26526105403900146, "step": 2080 }, { "epoch": 0.130125, "grad_norm": 3.484375, "grad_norm_var": 0.021773274739583334, "learning_rate": 0.0001, "loss": 8.5927, "loss/crossentropy": 2.325153946876526, "loss/hidden": 3.28125, "loss/jsd": 0.0, "loss/logits": 0.2865811884403229, "step": 2082 }, { "epoch": 0.13025, "grad_norm": 3.265625, "grad_norm_var": 0.021870930989583332, "learning_rate": 0.0001, "loss": 8.3854, "loss/crossentropy": 2.3805923461914062, "loss/hidden": 3.28125, "loss/jsd": 0.0, "loss/logits": 0.26753927767276764, "step": 2084 }, { "epoch": 0.130375, "grad_norm": 3.453125, "grad_norm_var": 0.030301920572916665, "learning_rate": 0.0001, "loss": 8.4407, "loss/crossentropy": 2.314916491508484, "loss/hidden": 3.2578125, "loss/jsd": 0.0, "loss/logits": 0.27673472464084625, "step": 2086 }, { "epoch": 0.1305, "grad_norm": 3.234375, "grad_norm_var": 0.025699869791666666, "learning_rate": 0.0001, "loss": 8.4033, "loss/crossentropy": 2.416364073753357, "loss/hidden": 3.34375, "loss/jsd": 0.0, "loss/logits": 0.27838364243507385, "step": 2088 }, { "epoch": 0.130625, "grad_norm": 3.421875, "grad_norm_var": 0.027074178059895832, "learning_rate": 0.0001, "loss": 8.3806, "loss/crossentropy": 2.4156532287597656, "loss/hidden": 3.296875, "loss/jsd": 0.0, "loss/logits": 0.2752366364002228, "step": 2090 }, { "epoch": 0.13075, "grad_norm": 3.375, "grad_norm_var": 0.025288899739583332, "learning_rate": 0.0001, "loss": 8.6506, "loss/crossentropy": 2.5105860233306885, "loss/hidden": 3.28125, "loss/jsd": 0.0, "loss/logits": 0.28529515862464905, "step": 2092 }, { "epoch": 0.130875, "grad_norm": 3.296875, "grad_norm_var": 0.020458984375, "learning_rate": 0.0001, "loss": 8.613, "loss/crossentropy": 2.4152865409851074, "loss/hidden": 3.3203125, "loss/jsd": 0.0, "loss/logits": 0.304058313369751, "step": 2094 }, { "epoch": 0.131, "grad_norm": 3.28125, "grad_norm_var": 0.0170318603515625, "learning_rate": 0.0001, "loss": 8.5973, "loss/crossentropy": 2.3700071573257446, "loss/hidden": 3.2265625, "loss/jsd": 0.0, "loss/logits": 0.2684682756662369, "step": 2096 }, { "epoch": 0.131125, "grad_norm": 3.171875, "grad_norm_var": 0.015120442708333333, "learning_rate": 0.0001, "loss": 8.4983, "loss/crossentropy": 2.305688500404358, "loss/hidden": 3.2578125, "loss/jsd": 0.0, "loss/logits": 0.28055354952812195, "step": 2098 }, { "epoch": 0.13125, "grad_norm": 3.09375, "grad_norm_var": 0.018830362955729166, "learning_rate": 0.0001, "loss": 8.3541, "loss/crossentropy": 2.0869110226631165, "loss/hidden": 3.2734375, "loss/jsd": 0.0, "loss/logits": 0.2802015244960785, "step": 2100 }, { "epoch": 0.131375, "grad_norm": 3.109375, "grad_norm_var": 0.015999348958333333, "learning_rate": 0.0001, "loss": 8.2526, "loss/crossentropy": 2.242175340652466, "loss/hidden": 3.28125, "loss/jsd": 0.0, "loss/logits": 0.2823330760002136, "step": 2102 }, { "epoch": 0.1315, "grad_norm": 3.53125, "grad_norm_var": 0.020384724934895834, "learning_rate": 0.0001, "loss": 8.2892, "loss/crossentropy": 2.3229693174362183, "loss/hidden": 3.3359375, "loss/jsd": 0.0, "loss/logits": 0.2636559307575226, "step": 2104 }, { "epoch": 0.131625, "grad_norm": 3.34375, "grad_norm_var": 0.019831339518229168, "learning_rate": 0.0001, "loss": 8.4317, "loss/crossentropy": 2.5321102142333984, "loss/hidden": 3.3515625, "loss/jsd": 0.0, "loss/logits": 0.2901579737663269, "step": 2106 }, { "epoch": 0.13175, "grad_norm": 3.265625, "grad_norm_var": 0.01666259765625, "learning_rate": 0.0001, "loss": 8.4445, "loss/crossentropy": 2.397943615913391, "loss/hidden": 3.21875, "loss/jsd": 0.0, "loss/logits": 0.2635400742292404, "step": 2108 }, { "epoch": 0.131875, "grad_norm": 3.546875, "grad_norm_var": 0.021654256184895835, "learning_rate": 0.0001, "loss": 8.4828, "loss/crossentropy": 2.489462733268738, "loss/hidden": 3.28125, "loss/jsd": 0.0, "loss/logits": 0.2792319655418396, "step": 2110 }, { "epoch": 0.132, "grad_norm": 3.375, "grad_norm_var": 0.022289021809895834, "learning_rate": 0.0001, "loss": 8.5521, "loss/crossentropy": 2.3110828399658203, "loss/hidden": 3.296875, "loss/jsd": 0.0, "loss/logits": 0.30214357376098633, "step": 2112 }, { "epoch": 0.132125, "grad_norm": 3.375, "grad_norm_var": 0.022037760416666666, "learning_rate": 0.0001, "loss": 8.5841, "loss/crossentropy": 2.563341736793518, "loss/hidden": 3.3515625, "loss/jsd": 0.0, "loss/logits": 0.2941661477088928, "step": 2114 }, { "epoch": 0.13225, "grad_norm": 3.078125, "grad_norm_var": 0.022459920247395834, "learning_rate": 0.0001, "loss": 8.3364, "loss/crossentropy": 2.4198527336120605, "loss/hidden": 3.296875, "loss/jsd": 0.0, "loss/logits": 0.2666160762310028, "step": 2116 }, { "epoch": 0.132375, "grad_norm": 3.09375, "grad_norm_var": 0.020563761393229168, "learning_rate": 0.0001, "loss": 8.2778, "loss/crossentropy": 2.285884737968445, "loss/hidden": 3.2421875, "loss/jsd": 0.0, "loss/logits": 0.27266763150691986, "step": 2118 }, { "epoch": 0.1325, "grad_norm": 3.25, "grad_norm_var": 0.016988118489583332, "learning_rate": 0.0001, "loss": 8.5911, "loss/crossentropy": 2.3701746463775635, "loss/hidden": 3.3359375, "loss/jsd": 0.0, "loss/logits": 0.29898887872695923, "step": 2120 }, { "epoch": 0.132625, "grad_norm": 3.296875, "grad_norm_var": 0.015062459309895833, "learning_rate": 0.0001, "loss": 8.658, "loss/crossentropy": 2.4369832277297974, "loss/hidden": 3.28125, "loss/jsd": 0.0, "loss/logits": 0.27174198627471924, "step": 2122 }, { "epoch": 0.13275, "grad_norm": 3.578125, "grad_norm_var": 0.022785441080729166, "learning_rate": 0.0001, "loss": 8.5709, "loss/crossentropy": 2.3035892248153687, "loss/hidden": 3.265625, "loss/jsd": 0.0, "loss/logits": 0.2923412248492241, "step": 2124 }, { "epoch": 0.132875, "grad_norm": 3.0625, "grad_norm_var": 0.02310791015625, "learning_rate": 0.0001, "loss": 8.1847, "loss/crossentropy": 2.384789824485779, "loss/hidden": 3.2265625, "loss/jsd": 0.0, "loss/logits": 0.27344001829624176, "step": 2126 }, { "epoch": 0.133, "grad_norm": 3.0625, "grad_norm_var": 0.024242146809895834, "learning_rate": 0.0001, "loss": 8.306, "loss/crossentropy": 2.2693413496017456, "loss/hidden": 3.2421875, "loss/jsd": 0.0, "loss/logits": 0.2933850884437561, "step": 2128 }, { "epoch": 0.133125, "grad_norm": 3.265625, "grad_norm_var": 0.023193359375, "learning_rate": 0.0001, "loss": 8.5839, "loss/crossentropy": 2.287759780883789, "loss/hidden": 3.2734375, "loss/jsd": 0.0, "loss/logits": 0.2801215946674347, "step": 2130 }, { "epoch": 0.13325, "grad_norm": 3.34375, "grad_norm_var": 0.021825154622395832, "learning_rate": 0.0001, "loss": 8.7328, "loss/crossentropy": 2.274720072746277, "loss/hidden": 3.3359375, "loss/jsd": 0.0, "loss/logits": 0.31832873821258545, "step": 2132 }, { "epoch": 0.133375, "grad_norm": 3.265625, "grad_norm_var": 0.02047119140625, "learning_rate": 0.0001, "loss": 8.4415, "loss/crossentropy": 2.5326437950134277, "loss/hidden": 3.2890625, "loss/jsd": 0.0, "loss/logits": 0.2927217036485672, "step": 2134 }, { "epoch": 0.1335, "grad_norm": 3.546875, "grad_norm_var": 0.025422159830729166, "learning_rate": 0.0001, "loss": 8.4062, "loss/crossentropy": 2.4775502681732178, "loss/hidden": 3.328125, "loss/jsd": 0.0, "loss/logits": 0.28668080270290375, "step": 2136 }, { "epoch": 0.133625, "grad_norm": 2.96875, "grad_norm_var": 0.03284505208333333, "learning_rate": 0.0001, "loss": 8.4527, "loss/crossentropy": 2.2728978395462036, "loss/hidden": 3.21875, "loss/jsd": 0.0, "loss/logits": 0.2754566967487335, "step": 2138 }, { "epoch": 0.13375, "grad_norm": 3.140625, "grad_norm_var": 0.024095662434895835, "learning_rate": 0.0001, "loss": 8.6044, "loss/crossentropy": 2.3857744932174683, "loss/hidden": 3.28125, "loss/jsd": 0.0, "loss/logits": 0.2717433273792267, "step": 2140 }, { "epoch": 0.133875, "grad_norm": 3.3125, "grad_norm_var": 0.022175089518229166, "learning_rate": 0.0001, "loss": 8.5458, "loss/crossentropy": 2.1756142377853394, "loss/hidden": 3.2578125, "loss/jsd": 0.0, "loss/logits": 0.27554982900619507, "step": 2142 }, { "epoch": 0.134, "grad_norm": 3.078125, "grad_norm_var": 0.021971638997395834, "learning_rate": 0.0001, "loss": 8.3312, "loss/crossentropy": 2.3074915409088135, "loss/hidden": 3.2421875, "loss/jsd": 0.0, "loss/logits": 0.2622702121734619, "step": 2144 }, { "epoch": 0.134125, "grad_norm": 3.03125, "grad_norm_var": 0.0297271728515625, "learning_rate": 0.0001, "loss": 8.2721, "loss/crossentropy": 2.320749521255493, "loss/hidden": 3.234375, "loss/jsd": 0.0, "loss/logits": 0.28162428736686707, "step": 2146 }, { "epoch": 0.13425, "grad_norm": 3.046875, "grad_norm_var": 0.031061808268229168, "learning_rate": 0.0001, "loss": 8.3269, "loss/crossentropy": 2.009217321872711, "loss/hidden": 3.234375, "loss/jsd": 0.0, "loss/logits": 0.2450430542230606, "step": 2148 }, { "epoch": 0.134375, "grad_norm": 3.3125, "grad_norm_var": 0.031037394205729166, "learning_rate": 0.0001, "loss": 8.4999, "loss/crossentropy": 2.278393268585205, "loss/hidden": 3.2578125, "loss/jsd": 0.0, "loss/logits": 0.2750803083181381, "step": 2150 }, { "epoch": 0.1345, "grad_norm": 3.265625, "grad_norm_var": 0.02255859375, "learning_rate": 0.0001, "loss": 8.4054, "loss/crossentropy": 2.2876728773117065, "loss/hidden": 3.28125, "loss/jsd": 0.0, "loss/logits": 0.281391441822052, "step": 2152 }, { "epoch": 0.134625, "grad_norm": 3.296875, "grad_norm_var": 0.016682942708333332, "learning_rate": 0.0001, "loss": 8.2786, "loss/crossentropy": 2.163739323616028, "loss/hidden": 3.234375, "loss/jsd": 0.0, "loss/logits": 0.2533603757619858, "step": 2154 }, { "epoch": 0.13475, "grad_norm": 3.0625, "grad_norm_var": 0.0157867431640625, "learning_rate": 0.0001, "loss": 8.3023, "loss/crossentropy": 2.3555957078933716, "loss/hidden": 3.3515625, "loss/jsd": 0.0, "loss/logits": 0.3196130394935608, "step": 2156 }, { "epoch": 0.134875, "grad_norm": 3.140625, "grad_norm_var": 0.0141754150390625, "learning_rate": 0.0001, "loss": 8.4222, "loss/crossentropy": 2.389632821083069, "loss/hidden": 3.3359375, "loss/jsd": 0.0, "loss/logits": 0.2907683253288269, "step": 2158 }, { "epoch": 0.135, "grad_norm": 2.9375, "grad_norm_var": 0.015576171875, "learning_rate": 0.0001, "loss": 8.2051, "loss/crossentropy": 2.140980839729309, "loss/hidden": 3.265625, "loss/jsd": 0.0, "loss/logits": 0.274806946516037, "step": 2160 }, { "epoch": 0.135125, "grad_norm": 3.375, "grad_norm_var": 0.016227213541666667, "learning_rate": 0.0001, "loss": 8.4341, "loss/crossentropy": 2.298740863800049, "loss/hidden": 3.4453125, "loss/jsd": 0.0, "loss/logits": 0.30332574248313904, "step": 2162 }, { "epoch": 0.13525, "grad_norm": 3.390625, "grad_norm_var": 0.0182281494140625, "learning_rate": 0.0001, "loss": 8.267, "loss/crossentropy": 2.3586976528167725, "loss/hidden": 3.2734375, "loss/jsd": 0.0, "loss/logits": 0.2925044894218445, "step": 2164 }, { "epoch": 0.135375, "grad_norm": 3.09375, "grad_norm_var": 0.014676920572916667, "learning_rate": 0.0001, "loss": 8.4944, "loss/crossentropy": 2.4937496185302734, "loss/hidden": 3.3125, "loss/jsd": 0.0, "loss/logits": 0.28084906935691833, "step": 2166 }, { "epoch": 0.1355, "grad_norm": 3.21875, "grad_norm_var": 0.014778645833333333, "learning_rate": 0.0001, "loss": 8.3792, "loss/crossentropy": 2.2324042320251465, "loss/hidden": 3.28125, "loss/jsd": 0.0, "loss/logits": 0.26973237097263336, "step": 2168 }, { "epoch": 0.135625, "grad_norm": 3.359375, "grad_norm_var": 0.01695556640625, "learning_rate": 0.0001, "loss": 8.3303, "loss/crossentropy": 2.3384079933166504, "loss/hidden": 3.3671875, "loss/jsd": 0.0, "loss/logits": 0.2514626085758209, "step": 2170 }, { "epoch": 0.13575, "grad_norm": 3.109375, "grad_norm_var": 0.01636962890625, "learning_rate": 0.0001, "loss": 8.2997, "loss/crossentropy": 2.2900805473327637, "loss/hidden": 3.265625, "loss/jsd": 0.0, "loss/logits": 0.287667915225029, "step": 2172 }, { "epoch": 0.135875, "grad_norm": 4.4375, "grad_norm_var": 0.113818359375, "learning_rate": 0.0001, "loss": 8.6838, "loss/crossentropy": 2.3386855125427246, "loss/hidden": 3.3515625, "loss/jsd": 0.0, "loss/logits": 0.28304457664489746, "step": 2174 }, { "epoch": 0.136, "grad_norm": 3.328125, "grad_norm_var": 0.10152587890625, "learning_rate": 0.0001, "loss": 8.6361, "loss/crossentropy": 2.4590978622436523, "loss/hidden": 3.2890625, "loss/jsd": 0.0, "loss/logits": 0.3032967746257782, "step": 2176 }, { "epoch": 0.136125, "grad_norm": 3.640625, "grad_norm_var": 0.10586649576822917, "learning_rate": 0.0001, "loss": 8.524, "loss/crossentropy": 2.4377013444900513, "loss/hidden": 3.3046875, "loss/jsd": 0.0, "loss/logits": 0.2948048859834671, "step": 2178 }, { "epoch": 0.13625, "grad_norm": 3.5, "grad_norm_var": 0.11148173014322917, "learning_rate": 0.0001, "loss": 8.427, "loss/crossentropy": 2.3763121366500854, "loss/hidden": 3.28125, "loss/jsd": 0.0, "loss/logits": 0.2737642973661423, "step": 2180 }, { "epoch": 0.136375, "grad_norm": 3.46875, "grad_norm_var": 0.10756734212239584, "learning_rate": 0.0001, "loss": 8.7061, "loss/crossentropy": 2.3559194803237915, "loss/hidden": 3.34375, "loss/jsd": 0.0, "loss/logits": 0.32232511043548584, "step": 2182 }, { "epoch": 0.1365, "grad_norm": 3.21875, "grad_norm_var": 0.10926106770833334, "learning_rate": 0.0001, "loss": 8.3743, "loss/crossentropy": 2.3787566423416138, "loss/hidden": 3.2890625, "loss/jsd": 0.0, "loss/logits": 0.266545370221138, "step": 2184 }, { "epoch": 0.136625, "grad_norm": 3.03125, "grad_norm_var": 0.1169830322265625, "learning_rate": 0.0001, "loss": 8.5617, "loss/crossentropy": 2.4981011152267456, "loss/hidden": 3.296875, "loss/jsd": 0.0, "loss/logits": 0.30071987211704254, "step": 2186 }, { "epoch": 0.13675, "grad_norm": 3.71875, "grad_norm_var": 0.91328125, "learning_rate": 0.0001, "loss": 8.6439, "loss/crossentropy": 2.4121392965316772, "loss/hidden": 3.28125, "loss/jsd": 0.0, "loss/logits": 0.28945305943489075, "step": 2188 }, { "epoch": 0.136875, "grad_norm": 3.34375, "grad_norm_var": 0.8569498697916667, "learning_rate": 0.0001, "loss": 8.6653, "loss/crossentropy": 2.5263431072235107, "loss/hidden": 3.3671875, "loss/jsd": 0.0, "loss/logits": 0.3138289451599121, "step": 2190 }, { "epoch": 0.137, "grad_norm": 3.421875, "grad_norm_var": 0.8448404947916667, "learning_rate": 0.0001, "loss": 8.6425, "loss/crossentropy": 2.3461450338363647, "loss/hidden": 3.3203125, "loss/jsd": 0.0, "loss/logits": 0.2797774076461792, "step": 2192 }, { "epoch": 0.137125, "grad_norm": 3.125, "grad_norm_var": 0.8635813395182291, "learning_rate": 0.0001, "loss": 8.7482, "loss/crossentropy": 2.562302827835083, "loss/hidden": 3.4140625, "loss/jsd": 0.0, "loss/logits": 0.2883787453174591, "step": 2194 }, { "epoch": 0.13725, "grad_norm": 3.671875, "grad_norm_var": 0.8723917643229167, "learning_rate": 0.0001, "loss": 8.5541, "loss/crossentropy": 2.392000913619995, "loss/hidden": 3.2734375, "loss/jsd": 0.0, "loss/logits": 0.28321924805641174, "step": 2196 }, { "epoch": 0.137375, "grad_norm": 3.09375, "grad_norm_var": 0.8815592447916667, "learning_rate": 0.0001, "loss": 8.2531, "loss/crossentropy": 2.250584840774536, "loss/hidden": 3.234375, "loss/jsd": 0.0, "loss/logits": 0.2665387690067291, "step": 2198 }, { "epoch": 0.1375, "grad_norm": 3.484375, "grad_norm_var": 0.8637003580729167, "learning_rate": 0.0001, "loss": 8.2037, "loss/crossentropy": 2.2912397384643555, "loss/hidden": 3.21875, "loss/jsd": 0.0, "loss/logits": 0.2549472153186798, "step": 2200 }, { "epoch": 0.137625, "grad_norm": 3.40625, "grad_norm_var": 0.8500803629557292, "learning_rate": 0.0001, "loss": 8.3596, "loss/crossentropy": 2.470545172691345, "loss/hidden": 3.296875, "loss/jsd": 0.0, "loss/logits": 0.29853616654872894, "step": 2202 }, { "epoch": 0.13775, "grad_norm": 3.140625, "grad_norm_var": 0.0280670166015625, "learning_rate": 0.0001, "loss": 8.4787, "loss/crossentropy": 2.1322121024131775, "loss/hidden": 3.2578125, "loss/jsd": 0.0, "loss/logits": 0.2622481659054756, "step": 2204 }, { "epoch": 0.137875, "grad_norm": 3.28125, "grad_norm_var": 0.026740519205729167, "learning_rate": 0.0001, "loss": 8.466, "loss/crossentropy": 2.340916156768799, "loss/hidden": 3.2421875, "loss/jsd": 0.0, "loss/logits": 0.2863175719976425, "step": 2206 }, { "epoch": 0.138, "grad_norm": 3.125, "grad_norm_var": 0.03297119140625, "learning_rate": 0.0001, "loss": 8.4187, "loss/crossentropy": 2.397621750831604, "loss/hidden": 3.2421875, "loss/jsd": 0.0, "loss/logits": 0.2743557095527649, "step": 2208 }, { "epoch": 0.138125, "grad_norm": 3.0, "grad_norm_var": 0.03684794108072917, "learning_rate": 0.0001, "loss": 8.3557, "loss/crossentropy": 2.2214730978012085, "loss/hidden": 3.3359375, "loss/jsd": 0.0, "loss/logits": 0.3140524923801422, "step": 2210 }, { "epoch": 0.13825, "grad_norm": 3.515625, "grad_norm_var": 0.029759724934895832, "learning_rate": 0.0001, "loss": 8.4706, "loss/crossentropy": 2.3356775045394897, "loss/hidden": 3.2890625, "loss/jsd": 0.0, "loss/logits": 0.28506164252758026, "step": 2212 }, { "epoch": 0.138375, "grad_norm": 3.09375, "grad_norm_var": 0.030061848958333335, "learning_rate": 0.0001, "loss": 8.5412, "loss/crossentropy": 2.2626901865005493, "loss/hidden": 3.25, "loss/jsd": 0.0, "loss/logits": 0.2693404108285904, "step": 2214 }, { "epoch": 0.1385, "grad_norm": 3.171875, "grad_norm_var": 0.0215728759765625, "learning_rate": 0.0001, "loss": 8.3191, "loss/crossentropy": 2.1286741495132446, "loss/hidden": 3.2578125, "loss/jsd": 0.0, "loss/logits": 0.2530980408191681, "step": 2216 }, { "epoch": 0.138625, "grad_norm": 3.21875, "grad_norm_var": 0.021565755208333332, "learning_rate": 0.0001, "loss": 8.6178, "loss/crossentropy": 2.3509132862091064, "loss/hidden": 3.21875, "loss/jsd": 0.0, "loss/logits": 0.2691505402326584, "step": 2218 }, { "epoch": 0.13875, "grad_norm": 3.46875, "grad_norm_var": 0.0286529541015625, "learning_rate": 0.0001, "loss": 8.6084, "loss/crossentropy": 2.3187073469161987, "loss/hidden": 3.3203125, "loss/jsd": 0.0, "loss/logits": 0.2853757441043854, "step": 2220 }, { "epoch": 0.138875, "grad_norm": 3.171875, "grad_norm_var": 0.030882771809895834, "learning_rate": 0.0001, "loss": 8.5621, "loss/crossentropy": 2.4514535665512085, "loss/hidden": 3.296875, "loss/jsd": 0.0, "loss/logits": 0.2782391607761383, "step": 2222 }, { "epoch": 0.139, "grad_norm": 3.25, "grad_norm_var": 0.0228912353515625, "learning_rate": 0.0001, "loss": 8.3712, "loss/crossentropy": 2.509943962097168, "loss/hidden": 3.2265625, "loss/jsd": 0.0, "loss/logits": 0.2838071286678314, "step": 2224 }, { "epoch": 0.139125, "grad_norm": 3.234375, "grad_norm_var": 0.01640625, "learning_rate": 0.0001, "loss": 8.5852, "loss/crossentropy": 2.2804245948791504, "loss/hidden": 3.3984375, "loss/jsd": 0.0, "loss/logits": 0.28991344571113586, "step": 2226 }, { "epoch": 0.13925, "grad_norm": 3.796875, "grad_norm_var": 0.0291412353515625, "learning_rate": 0.0001, "loss": 8.7108, "loss/crossentropy": 2.453079104423523, "loss/hidden": 3.265625, "loss/jsd": 0.0, "loss/logits": 0.2956879884004593, "step": 2228 }, { "epoch": 0.139375, "grad_norm": 3.296875, "grad_norm_var": 0.02652587890625, "learning_rate": 0.0001, "loss": 8.6285, "loss/crossentropy": 2.4368172883987427, "loss/hidden": 3.3046875, "loss/jsd": 0.0, "loss/logits": 0.3047195374965668, "step": 2230 }, { "epoch": 0.1395, "grad_norm": 3.140625, "grad_norm_var": 0.0262359619140625, "learning_rate": 0.0001, "loss": 8.4539, "loss/crossentropy": 2.306997299194336, "loss/hidden": 3.296875, "loss/jsd": 0.0, "loss/logits": 0.251560240983963, "step": 2232 }, { "epoch": 0.139625, "grad_norm": 3.03125, "grad_norm_var": 0.042041015625, "learning_rate": 0.0001, "loss": 8.424, "loss/crossentropy": 2.139360785484314, "loss/hidden": 3.2421875, "loss/jsd": 0.0, "loss/logits": 0.28243619203567505, "step": 2234 }, { "epoch": 0.13975, "grad_norm": 3.171875, "grad_norm_var": 0.03759358723958333, "learning_rate": 0.0001, "loss": 8.2889, "loss/crossentropy": 2.2428722381591797, "loss/hidden": 3.2109375, "loss/jsd": 0.0, "loss/logits": 0.2649083435535431, "step": 2236 }, { "epoch": 0.139875, "grad_norm": 3.171875, "grad_norm_var": 0.03997395833333333, "learning_rate": 0.0001, "loss": 8.3299, "loss/crossentropy": 2.3508530855178833, "loss/hidden": 3.2578125, "loss/jsd": 0.0, "loss/logits": 0.2734558805823326, "step": 2238 }, { "epoch": 0.14, "grad_norm": 3.15625, "grad_norm_var": 0.042267862955729166, "learning_rate": 0.0001, "loss": 8.1833, "loss/crossentropy": 2.3940563201904297, "loss/hidden": 3.203125, "loss/jsd": 0.0, "loss/logits": 0.26705045998096466, "step": 2240 }, { "epoch": 0.140125, "grad_norm": 3.359375, "grad_norm_var": 0.0432037353515625, "learning_rate": 0.0001, "loss": 8.4273, "loss/crossentropy": 2.294648766517639, "loss/hidden": 3.2734375, "loss/jsd": 0.0, "loss/logits": 0.2624429166316986, "step": 2242 }, { "epoch": 0.14025, "grad_norm": 3.21875, "grad_norm_var": 0.023824055989583332, "learning_rate": 0.0001, "loss": 8.4463, "loss/crossentropy": 2.303459644317627, "loss/hidden": 3.234375, "loss/jsd": 0.0, "loss/logits": 0.28779207170009613, "step": 2244 }, { "epoch": 0.140375, "grad_norm": 3.109375, "grad_norm_var": 0.022484334309895833, "learning_rate": 0.0001, "loss": 8.5258, "loss/crossentropy": 2.452125668525696, "loss/hidden": 3.34375, "loss/jsd": 0.0, "loss/logits": 0.281644269824028, "step": 2246 }, { "epoch": 0.1405, "grad_norm": 3.46875, "grad_norm_var": 0.028857421875, "learning_rate": 0.0001, "loss": 8.4929, "loss/crossentropy": 2.3145229816436768, "loss/hidden": 3.265625, "loss/jsd": 0.0, "loss/logits": 0.2791597843170166, "step": 2248 }, { "epoch": 0.140625, "grad_norm": 3.40625, "grad_norm_var": 0.10737202962239584, "learning_rate": 0.0001, "loss": 8.3374, "loss/crossentropy": 2.219459652900696, "loss/hidden": 3.3125, "loss/jsd": 0.0, "loss/logits": 0.2960502803325653, "step": 2250 }, { "epoch": 0.14075, "grad_norm": 3.296875, "grad_norm_var": 0.10914306640625, "learning_rate": 0.0001, "loss": 8.4506, "loss/crossentropy": 2.0157440304756165, "loss/hidden": 3.3359375, "loss/jsd": 0.0, "loss/logits": 0.2858085185289383, "step": 2252 }, { "epoch": 0.140875, "grad_norm": 3.375, "grad_norm_var": 0.10522359212239583, "learning_rate": 0.0001, "loss": 8.6443, "loss/crossentropy": 2.3050343990325928, "loss/hidden": 3.25, "loss/jsd": 0.0, "loss/logits": 0.2768677771091461, "step": 2254 }, { "epoch": 0.141, "grad_norm": 2.875, "grad_norm_var": 0.12316792805989583, "learning_rate": 0.0001, "loss": 8.1788, "loss/crossentropy": 2.2533038854599, "loss/hidden": 3.203125, "loss/jsd": 0.0, "loss/logits": 0.24813885986804962, "step": 2256 }, { "epoch": 0.141125, "grad_norm": 3.3125, "grad_norm_var": 0.12766927083333332, "learning_rate": 0.0001, "loss": 8.3708, "loss/crossentropy": 2.4170485734939575, "loss/hidden": 3.21875, "loss/jsd": 0.0, "loss/logits": 0.2693845331668854, "step": 2258 }, { "epoch": 0.14125, "grad_norm": 3.0625, "grad_norm_var": 0.13147684733072917, "learning_rate": 0.0001, "loss": 8.4648, "loss/crossentropy": 2.294472575187683, "loss/hidden": 3.25, "loss/jsd": 0.0, "loss/logits": 0.2734464854001999, "step": 2260 }, { "epoch": 0.141375, "grad_norm": 3.203125, "grad_norm_var": 0.12929585774739583, "learning_rate": 0.0001, "loss": 8.1808, "loss/crossentropy": 2.1154235005378723, "loss/hidden": 3.265625, "loss/jsd": 0.0, "loss/logits": 0.28365112841129303, "step": 2262 }, { "epoch": 0.1415, "grad_norm": 3.046875, "grad_norm_var": 0.1328125, "learning_rate": 0.0001, "loss": 8.1941, "loss/crossentropy": 2.1333194971084595, "loss/hidden": 3.2578125, "loss/jsd": 0.0, "loss/logits": 0.32095377147197723, "step": 2264 }, { "epoch": 0.141625, "grad_norm": 3.375, "grad_norm_var": 0.02467041015625, "learning_rate": 0.0001, "loss": 8.4459, "loss/crossentropy": 2.3149880170822144, "loss/hidden": 3.2421875, "loss/jsd": 0.0, "loss/logits": 0.2762584388256073, "step": 2266 }, { "epoch": 0.14175, "grad_norm": 2.9375, "grad_norm_var": 0.028120930989583334, "learning_rate": 0.0001, "loss": 8.422, "loss/crossentropy": 2.6591659784317017, "loss/hidden": 3.375, "loss/jsd": 0.0, "loss/logits": 0.29070281982421875, "step": 2268 }, { "epoch": 0.141875, "grad_norm": 3.359375, "grad_norm_var": 0.026102701822916668, "learning_rate": 0.0001, "loss": 8.2709, "loss/crossentropy": 2.3531733751296997, "loss/hidden": 3.1953125, "loss/jsd": 0.0, "loss/logits": 0.24519621580839157, "step": 2270 }, { "epoch": 0.142, "grad_norm": 3.09375, "grad_norm_var": 0.05454813639322917, "learning_rate": 0.0001, "loss": 8.2338, "loss/crossentropy": 2.1661760807037354, "loss/hidden": 3.25, "loss/jsd": 0.0, "loss/logits": 0.24747133255004883, "step": 2272 }, { "epoch": 0.142125, "grad_norm": 3.1875, "grad_norm_var": 0.0540191650390625, "learning_rate": 0.0001, "loss": 8.5048, "loss/crossentropy": 2.2301371097564697, "loss/hidden": 3.2421875, "loss/jsd": 0.0, "loss/logits": 0.269399493932724, "step": 2274 }, { "epoch": 0.14225, "grad_norm": 3.640625, "grad_norm_var": 0.06352437337239583, "learning_rate": 0.0001, "loss": 8.5953, "loss/crossentropy": 2.65217924118042, "loss/hidden": 3.2890625, "loss/jsd": 0.0, "loss/logits": 0.288607120513916, "step": 2276 }, { "epoch": 0.142375, "grad_norm": 2.9375, "grad_norm_var": 0.07203369140625, "learning_rate": 0.0001, "loss": 8.0698, "loss/crossentropy": 2.4561513662338257, "loss/hidden": 3.265625, "loss/jsd": 0.0, "loss/logits": 0.2938566952943802, "step": 2278 }, { "epoch": 0.1425, "grad_norm": 3.421875, "grad_norm_var": 0.0697174072265625, "learning_rate": 0.0001, "loss": 8.3514, "loss/crossentropy": 2.517719268798828, "loss/hidden": 3.2890625, "loss/jsd": 0.0, "loss/logits": 0.2574344277381897, "step": 2280 }, { "epoch": 0.142625, "grad_norm": 3.078125, "grad_norm_var": 0.07379150390625, "learning_rate": 0.0001, "loss": 8.404, "loss/crossentropy": 2.507733106613159, "loss/hidden": 3.34375, "loss/jsd": 0.0, "loss/logits": 0.28523363173007965, "step": 2282 }, { "epoch": 0.14275, "grad_norm": 3.078125, "grad_norm_var": 0.0678863525390625, "learning_rate": 0.0001, "loss": 8.3211, "loss/crossentropy": 2.248465895652771, "loss/hidden": 3.265625, "loss/jsd": 0.0, "loss/logits": 0.27300480008125305, "step": 2284 }, { "epoch": 0.142875, "grad_norm": 3.40625, "grad_norm_var": 0.06708577473958334, "learning_rate": 0.0001, "loss": 8.4819, "loss/crossentropy": 2.175841212272644, "loss/hidden": 3.421875, "loss/jsd": 0.0, "loss/logits": 0.2846294641494751, "step": 2286 }, { "epoch": 0.143, "grad_norm": 3.328125, "grad_norm_var": 0.043257649739583334, "learning_rate": 0.0001, "loss": 8.5189, "loss/crossentropy": 2.53265118598938, "loss/hidden": 3.1953125, "loss/jsd": 0.0, "loss/logits": 0.2701347917318344, "step": 2288 }, { "epoch": 0.143125, "grad_norm": 3.15625, "grad_norm_var": 0.0451812744140625, "learning_rate": 0.0001, "loss": 8.1393, "loss/crossentropy": 2.2115447521209717, "loss/hidden": 3.25, "loss/jsd": 0.0, "loss/logits": 0.26285167038440704, "step": 2290 }, { "epoch": 0.14325, "grad_norm": 2.953125, "grad_norm_var": 0.038895670572916666, "learning_rate": 0.0001, "loss": 8.4895, "loss/crossentropy": 2.540266752243042, "loss/hidden": 3.2890625, "loss/jsd": 0.0, "loss/logits": 0.29030686616897583, "step": 2292 }, { "epoch": 0.143375, "grad_norm": 3.109375, "grad_norm_var": 0.0341705322265625, "learning_rate": 0.0001, "loss": 8.4026, "loss/crossentropy": 2.361076593399048, "loss/hidden": 3.2578125, "loss/jsd": 0.0, "loss/logits": 0.26986297965049744, "step": 2294 }, { "epoch": 0.1435, "grad_norm": 3.109375, "grad_norm_var": 0.022297159830729166, "learning_rate": 0.0001, "loss": 8.4674, "loss/crossentropy": 2.3896981477737427, "loss/hidden": 3.265625, "loss/jsd": 0.0, "loss/logits": 0.2720335051417351, "step": 2296 }, { "epoch": 0.143625, "grad_norm": 3.171875, "grad_norm_var": 0.022001139322916665, "learning_rate": 0.0001, "loss": 8.1891, "loss/crossentropy": 2.1911017894744873, "loss/hidden": 3.2265625, "loss/jsd": 0.0, "loss/logits": 0.26920412480831146, "step": 2298 }, { "epoch": 0.14375, "grad_norm": 3.1875, "grad_norm_var": 0.026090494791666665, "learning_rate": 0.0001, "loss": 8.3786, "loss/crossentropy": 2.282583713531494, "loss/hidden": 3.28125, "loss/jsd": 0.0, "loss/logits": 0.27442607283592224, "step": 2300 }, { "epoch": 0.143875, "grad_norm": 3.140625, "grad_norm_var": 0.015534464518229167, "learning_rate": 0.0001, "loss": 8.3697, "loss/crossentropy": 2.3890000581741333, "loss/hidden": 3.3203125, "loss/jsd": 0.0, "loss/logits": 0.29275405406951904, "step": 2302 }, { "epoch": 0.144, "grad_norm": 3.34375, "grad_norm_var": 0.025178019205729166, "learning_rate": 0.0001, "loss": 8.6341, "loss/crossentropy": 2.5382707118988037, "loss/hidden": 3.2734375, "loss/jsd": 0.0, "loss/logits": 0.2767443507909775, "step": 2304 }, { "epoch": 0.144125, "grad_norm": 3.0, "grad_norm_var": 0.027099609375, "learning_rate": 0.0001, "loss": 8.427, "loss/crossentropy": 2.423385739326477, "loss/hidden": 3.265625, "loss/jsd": 0.0, "loss/logits": 0.2653146833181381, "step": 2306 }, { "epoch": 0.14425, "grad_norm": 3.140625, "grad_norm_var": 0.023274739583333332, "learning_rate": 0.0001, "loss": 8.3893, "loss/crossentropy": 2.351103663444519, "loss/hidden": 3.3828125, "loss/jsd": 0.0, "loss/logits": 0.2693053185939789, "step": 2308 }, { "epoch": 0.144375, "grad_norm": 2.78125, "grad_norm_var": 0.032957967122395834, "learning_rate": 0.0001, "loss": 8.1106, "loss/crossentropy": 2.161897659301758, "loss/hidden": 3.203125, "loss/jsd": 0.0, "loss/logits": 0.2585082948207855, "step": 2310 }, { "epoch": 0.1445, "grad_norm": 3.328125, "grad_norm_var": 0.034468587239583334, "learning_rate": 0.0001, "loss": 8.5501, "loss/crossentropy": 2.5974907875061035, "loss/hidden": 3.2421875, "loss/jsd": 0.0, "loss/logits": 0.2823385000228882, "step": 2312 }, { "epoch": 0.144625, "grad_norm": 3.125, "grad_norm_var": 0.031636555989583336, "learning_rate": 0.0001, "loss": 8.1398, "loss/crossentropy": 2.2245877981185913, "loss/hidden": 3.2265625, "loss/jsd": 0.0, "loss/logits": 0.2538044899702072, "step": 2314 }, { "epoch": 0.14475, "grad_norm": 3.140625, "grad_norm_var": 0.028251139322916667, "learning_rate": 0.0001, "loss": 8.334, "loss/crossentropy": 2.378359794616699, "loss/hidden": 3.25, "loss/jsd": 0.0, "loss/logits": 0.27289170026779175, "step": 2316 }, { "epoch": 0.144875, "grad_norm": 3.140625, "grad_norm_var": 0.031180826822916667, "learning_rate": 0.0001, "loss": 8.32, "loss/crossentropy": 2.2858855724334717, "loss/hidden": 3.234375, "loss/jsd": 0.0, "loss/logits": 0.2700234651565552, "step": 2318 }, { "epoch": 0.145, "grad_norm": 2.96875, "grad_norm_var": 0.016901652018229168, "learning_rate": 0.0001, "loss": 8.3103, "loss/crossentropy": 2.4376027584075928, "loss/hidden": 3.234375, "loss/jsd": 0.0, "loss/logits": 0.26375965774059296, "step": 2320 }, { "epoch": 0.145125, "grad_norm": 3.265625, "grad_norm_var": 0.019580078125, "learning_rate": 0.0001, "loss": 8.2817, "loss/crossentropy": 2.491786479949951, "loss/hidden": 3.2578125, "loss/jsd": 0.0, "loss/logits": 0.27208443731069565, "step": 2322 }, { "epoch": 0.14525, "grad_norm": 3.34375, "grad_norm_var": 0.022907511393229166, "learning_rate": 0.0001, "loss": 8.3947, "loss/crossentropy": 2.1058656573295593, "loss/hidden": 3.1640625, "loss/jsd": 0.0, "loss/logits": 0.2547650635242462, "step": 2324 }, { "epoch": 0.145375, "grad_norm": 2.984375, "grad_norm_var": 0.017171223958333332, "learning_rate": 0.0001, "loss": 8.2597, "loss/crossentropy": 2.208884119987488, "loss/hidden": 3.21875, "loss/jsd": 0.0, "loss/logits": 0.27819526195526123, "step": 2326 }, { "epoch": 0.1455, "grad_norm": 3.765625, "grad_norm_var": 0.04153645833333333, "learning_rate": 0.0001, "loss": 8.6312, "loss/crossentropy": 2.467799663543701, "loss/hidden": 3.2265625, "loss/jsd": 0.0, "loss/logits": 0.26558516919612885, "step": 2328 }, { "epoch": 0.145625, "grad_norm": 3.109375, "grad_norm_var": 0.04265034993489583, "learning_rate": 0.0001, "loss": 8.4207, "loss/crossentropy": 2.418417453765869, "loss/hidden": 3.2421875, "loss/jsd": 0.0, "loss/logits": 0.2655976563692093, "step": 2330 }, { "epoch": 0.14575, "grad_norm": 3.203125, "grad_norm_var": 0.040848795572916666, "learning_rate": 0.0001, "loss": 8.3613, "loss/crossentropy": 2.321434497833252, "loss/hidden": 3.2734375, "loss/jsd": 0.0, "loss/logits": 0.30776557326316833, "step": 2332 }, { "epoch": 0.145875, "grad_norm": 3.25, "grad_norm_var": 0.04389546712239583, "learning_rate": 0.0001, "loss": 8.5335, "loss/crossentropy": 2.321682333946228, "loss/hidden": 3.1875, "loss/jsd": 0.0, "loss/logits": 0.2719985991716385, "step": 2334 }, { "epoch": 0.146, "grad_norm": 2.875, "grad_norm_var": 0.04781901041666667, "learning_rate": 0.0001, "loss": 8.1955, "loss/crossentropy": 2.179586887359619, "loss/hidden": 3.3046875, "loss/jsd": 0.0, "loss/logits": 0.24687421321868896, "step": 2336 }, { "epoch": 0.146125, "grad_norm": 3.125, "grad_norm_var": 0.043863932291666664, "learning_rate": 0.0001, "loss": 8.3332, "loss/crossentropy": 2.24389386177063, "loss/hidden": 3.2265625, "loss/jsd": 0.0, "loss/logits": 0.29478612542152405, "step": 2338 }, { "epoch": 0.14625, "grad_norm": 3.15625, "grad_norm_var": 0.04262593587239583, "learning_rate": 0.0001, "loss": 8.5372, "loss/crossentropy": 2.5267962217330933, "loss/hidden": 3.2734375, "loss/jsd": 0.0, "loss/logits": 0.2840902507305145, "step": 2340 }, { "epoch": 0.146375, "grad_norm": 3.078125, "grad_norm_var": 0.041792805989583334, "learning_rate": 0.0001, "loss": 8.3559, "loss/crossentropy": 2.2484867572784424, "loss/hidden": 3.25, "loss/jsd": 0.0, "loss/logits": 0.26582688093185425, "step": 2342 }, { "epoch": 0.1465, "grad_norm": 3.34375, "grad_norm_var": 0.021675618489583333, "learning_rate": 0.0001, "loss": 8.3478, "loss/crossentropy": 2.3685790300369263, "loss/hidden": 3.2578125, "loss/jsd": 0.0, "loss/logits": 0.27287431061267853, "step": 2344 }, { "epoch": 0.146625, "grad_norm": 3.140625, "grad_norm_var": 0.0241363525390625, "learning_rate": 0.0001, "loss": 8.625, "loss/crossentropy": 2.430737853050232, "loss/hidden": 3.25, "loss/jsd": 0.0, "loss/logits": 0.2730434536933899, "step": 2346 }, { "epoch": 0.14675, "grad_norm": 2.890625, "grad_norm_var": 0.07626546223958333, "learning_rate": 0.0001, "loss": 8.3045, "loss/crossentropy": 2.4417322874069214, "loss/hidden": 3.265625, "loss/jsd": 0.0, "loss/logits": 0.26828059554100037, "step": 2348 }, { "epoch": 0.146875, "grad_norm": 3.90625, "grad_norm_var": 0.11562398274739584, "learning_rate": 0.0001, "loss": 8.4202, "loss/crossentropy": 2.2339513301849365, "loss/hidden": 3.234375, "loss/jsd": 0.0, "loss/logits": 0.2739114910364151, "step": 2350 }, { "epoch": 0.147, "grad_norm": 3.21875, "grad_norm_var": 0.10396219889322916, "learning_rate": 0.0001, "loss": 8.5676, "loss/crossentropy": 2.539394974708557, "loss/hidden": 3.28125, "loss/jsd": 0.0, "loss/logits": 0.2761157304048538, "step": 2352 }, { "epoch": 0.147125, "grad_norm": 3.25, "grad_norm_var": 0.10067952473958333, "learning_rate": 0.0001, "loss": 8.2937, "loss/crossentropy": 2.450527548789978, "loss/hidden": 3.2890625, "loss/jsd": 0.0, "loss/logits": 0.26511844992637634, "step": 2354 }, { "epoch": 0.14725, "grad_norm": 3.0625, "grad_norm_var": 0.10345052083333334, "learning_rate": 0.0001, "loss": 8.1971, "loss/crossentropy": 2.2373549938201904, "loss/hidden": 3.2109375, "loss/jsd": 0.0, "loss/logits": 0.26237648725509644, "step": 2356 }, { "epoch": 0.147375, "grad_norm": 3.59375, "grad_norm_var": 0.09674072265625, "learning_rate": 0.0001, "loss": 8.4327, "loss/crossentropy": 2.2837640047073364, "loss/hidden": 3.21875, "loss/jsd": 0.0, "loss/logits": 0.2683512270450592, "step": 2358 }, { "epoch": 0.1475, "grad_norm": 3.09375, "grad_norm_var": 0.10239156087239583, "learning_rate": 0.0001, "loss": 8.3287, "loss/crossentropy": 2.238037943840027, "loss/hidden": 3.203125, "loss/jsd": 0.0, "loss/logits": 0.2713322341442108, "step": 2360 }, { "epoch": 0.147625, "grad_norm": 3.0625, "grad_norm_var": 0.104736328125, "learning_rate": 0.0001, "loss": 8.4571, "loss/crossentropy": 2.5709011554718018, "loss/hidden": 3.25, "loss/jsd": 0.0, "loss/logits": 0.27457040548324585, "step": 2362 }, { "epoch": 0.14775, "grad_norm": 3.1875, "grad_norm_var": 0.0671783447265625, "learning_rate": 0.0001, "loss": 8.4263, "loss/crossentropy": 2.3125263452529907, "loss/hidden": 3.25, "loss/jsd": 0.0, "loss/logits": 0.28667566180229187, "step": 2364 }, { "epoch": 0.147875, "grad_norm": 3.328125, "grad_norm_var": 0.025983683268229165, "learning_rate": 0.0001, "loss": 8.2756, "loss/crossentropy": 2.230543076992035, "loss/hidden": 3.3125, "loss/jsd": 0.0, "loss/logits": 0.2760595977306366, "step": 2366 }, { "epoch": 0.148, "grad_norm": 3.078125, "grad_norm_var": 0.021598307291666667, "learning_rate": 0.0001, "loss": 8.3417, "loss/crossentropy": 2.6067885160446167, "loss/hidden": 3.296875, "loss/jsd": 0.0, "loss/logits": 0.2718808054924011, "step": 2368 }, { "epoch": 0.148125, "grad_norm": 3.5, "grad_norm_var": 0.0322418212890625, "learning_rate": 0.0001, "loss": 8.4566, "loss/crossentropy": 2.4617605209350586, "loss/hidden": 3.234375, "loss/jsd": 0.0, "loss/logits": 0.2566886991262436, "step": 2370 }, { "epoch": 0.14825, "grad_norm": 2.9375, "grad_norm_var": 0.03632405598958333, "learning_rate": 0.0001, "loss": 8.3095, "loss/crossentropy": 2.545408248901367, "loss/hidden": 3.2421875, "loss/jsd": 0.0, "loss/logits": 0.28086233139038086, "step": 2372 }, { "epoch": 0.148375, "grad_norm": 3.125, "grad_norm_var": 0.025862630208333334, "learning_rate": 0.0001, "loss": 8.4444, "loss/crossentropy": 2.310088276863098, "loss/hidden": 3.21875, "loss/jsd": 0.0, "loss/logits": 0.28762035071849823, "step": 2374 }, { "epoch": 0.1485, "grad_norm": 3.234375, "grad_norm_var": 0.0256500244140625, "learning_rate": 0.0001, "loss": 8.4027, "loss/crossentropy": 2.306009292602539, "loss/hidden": 3.25, "loss/jsd": 0.0, "loss/logits": 0.251963771879673, "step": 2376 }, { "epoch": 0.148625, "grad_norm": 3.21875, "grad_norm_var": 0.023368326822916667, "learning_rate": 0.0001, "loss": 8.2834, "loss/crossentropy": 2.531379818916321, "loss/hidden": 3.3203125, "loss/jsd": 0.0, "loss/logits": 0.2890370786190033, "step": 2378 }, { "epoch": 0.14875, "grad_norm": 3.09375, "grad_norm_var": 0.022652180989583333, "learning_rate": 0.0001, "loss": 8.5298, "loss/crossentropy": 2.4868820905685425, "loss/hidden": 3.21875, "loss/jsd": 0.0, "loss/logits": 0.28367380797863007, "step": 2380 }, { "epoch": 0.148875, "grad_norm": 3.109375, "grad_norm_var": 0.022391764322916667, "learning_rate": 0.0001, "loss": 8.2447, "loss/crossentropy": 2.4238349199295044, "loss/hidden": 3.2734375, "loss/jsd": 0.0, "loss/logits": 0.2638505697250366, "step": 2382 }, { "epoch": 0.149, "grad_norm": 3.046875, "grad_norm_var": 0.023078409830729167, "learning_rate": 0.0001, "loss": 8.1903, "loss/crossentropy": 2.428895592689514, "loss/hidden": 3.25, "loss/jsd": 0.0, "loss/logits": 0.2814117968082428, "step": 2384 }, { "epoch": 0.149125, "grad_norm": 3.015625, "grad_norm_var": 0.07431233723958333, "learning_rate": 0.0001, "loss": 8.3687, "loss/crossentropy": 2.4161367416381836, "loss/hidden": 3.375, "loss/jsd": 0.0, "loss/logits": 0.29132652282714844, "step": 2386 }, { "epoch": 0.14925, "grad_norm": 3.0625, "grad_norm_var": 0.06982014973958334, "learning_rate": 0.0001, "loss": 8.3171, "loss/crossentropy": 2.121293306350708, "loss/hidden": 3.15625, "loss/jsd": 0.0, "loss/logits": 0.2542402744293213, "step": 2388 }, { "epoch": 0.149375, "grad_norm": 3.421875, "grad_norm_var": 0.0720123291015625, "learning_rate": 0.0001, "loss": 8.0846, "loss/crossentropy": 2.355462431907654, "loss/hidden": 3.234375, "loss/jsd": 0.0, "loss/logits": 0.262673944234848, "step": 2390 }, { "epoch": 0.1495, "grad_norm": 4.15625, "grad_norm_var": 0.12470703125, "learning_rate": 0.0001, "loss": 8.5856, "loss/crossentropy": 2.4837170839309692, "loss/hidden": 3.2265625, "loss/jsd": 0.0, "loss/logits": 0.27958710491657257, "step": 2392 }, { "epoch": 0.149625, "grad_norm": 6.90625, "grad_norm_var": 0.9296295166015625, "learning_rate": 0.0001, "loss": 8.3578, "loss/crossentropy": 2.365368127822876, "loss/hidden": 3.25, "loss/jsd": 0.0, "loss/logits": 0.2629378139972687, "step": 2394 }, { "epoch": 0.14975, "grad_norm": 3.4375, "grad_norm_var": 0.9130360921223958, "learning_rate": 0.0001, "loss": 8.2264, "loss/crossentropy": 2.2927812337875366, "loss/hidden": 3.2578125, "loss/jsd": 0.0, "loss/logits": 0.2774803042411804, "step": 2396 }, { "epoch": 0.149875, "grad_norm": 3.21875, "grad_norm_var": 0.9198527018229167, "learning_rate": 0.0001, "loss": 8.3089, "loss/crossentropy": 2.3293185234069824, "loss/hidden": 3.3203125, "loss/jsd": 0.0, "loss/logits": 0.25967323780059814, "step": 2398 }, { "epoch": 0.15, "grad_norm": 3.5625, "grad_norm_var": 0.9080891927083333, "learning_rate": 0.0001, "loss": 8.4132, "loss/crossentropy": 2.153883457183838, "loss/hidden": 3.28125, "loss/jsd": 0.0, "loss/logits": 0.26730459928512573, "step": 2400 }, { "epoch": 0.150125, "grad_norm": 3.734375, "grad_norm_var": 0.8680084228515625, "learning_rate": 0.0001, "loss": 8.216, "loss/crossentropy": 2.3419724702835083, "loss/hidden": 3.2734375, "loss/jsd": 0.0, "loss/logits": 0.2801203727722168, "step": 2402 }, { "epoch": 0.15025, "grad_norm": 3.46875, "grad_norm_var": 0.8511301676432291, "learning_rate": 0.0001, "loss": 8.5621, "loss/crossentropy": 2.2398444414138794, "loss/hidden": 3.1953125, "loss/jsd": 0.0, "loss/logits": 0.2656910568475723, "step": 2404 }, { "epoch": 0.150375, "grad_norm": 3.1875, "grad_norm_var": 0.8526926676432292, "learning_rate": 0.0001, "loss": 8.5041, "loss/crossentropy": 2.604948043823242, "loss/hidden": 3.2578125, "loss/jsd": 0.0, "loss/logits": 0.26598772406578064, "step": 2406 }, { "epoch": 0.1505, "grad_norm": 3.296875, "grad_norm_var": 0.8330393473307292, "learning_rate": 0.0001, "loss": 8.5201, "loss/crossentropy": 2.3795058727264404, "loss/hidden": 3.3125, "loss/jsd": 0.0, "loss/logits": 0.2822035402059555, "step": 2408 }, { "epoch": 0.150625, "grad_norm": 3.03125, "grad_norm_var": 0.05833231608072917, "learning_rate": 0.0001, "loss": 8.2627, "loss/crossentropy": 2.440226197242737, "loss/hidden": 3.21875, "loss/jsd": 0.0, "loss/logits": 0.2705456465482712, "step": 2410 }, { "epoch": 0.15075, "grad_norm": 3.265625, "grad_norm_var": 0.05461324055989583, "learning_rate": 0.0001, "loss": 8.4428, "loss/crossentropy": 2.473353385925293, "loss/hidden": 3.2578125, "loss/jsd": 0.0, "loss/logits": 0.27890461683273315, "step": 2412 }, { "epoch": 0.150875, "grad_norm": 3.078125, "grad_norm_var": 0.05740559895833333, "learning_rate": 0.0001, "loss": 8.049, "loss/crossentropy": 2.4567281007766724, "loss/hidden": 3.2578125, "loss/jsd": 0.0, "loss/logits": 0.28310708701610565, "step": 2414 }, { "epoch": 0.151, "grad_norm": 3.28125, "grad_norm_var": 0.05642903645833333, "learning_rate": 0.0001, "loss": 8.706, "loss/crossentropy": 2.2676972150802612, "loss/hidden": 3.28125, "loss/jsd": 0.0, "loss/logits": 0.2879829406738281, "step": 2416 }, { "epoch": 0.151125, "grad_norm": 3.46875, "grad_norm_var": 0.045084635416666664, "learning_rate": 0.0001, "loss": 8.6477, "loss/crossentropy": 2.4988842010498047, "loss/hidden": 3.390625, "loss/jsd": 0.0, "loss/logits": 0.2907903641462326, "step": 2418 }, { "epoch": 0.15125, "grad_norm": 3.109375, "grad_norm_var": 0.0454498291015625, "learning_rate": 0.0001, "loss": 8.3035, "loss/crossentropy": 2.390307307243347, "loss/hidden": 3.265625, "loss/jsd": 0.0, "loss/logits": 0.2815091013908386, "step": 2420 }, { "epoch": 0.151375, "grad_norm": 2.984375, "grad_norm_var": 0.050080362955729166, "learning_rate": 0.0001, "loss": 8.1178, "loss/crossentropy": 2.188577175140381, "loss/hidden": 3.2265625, "loss/jsd": 0.0, "loss/logits": 0.2619321793317795, "step": 2422 }, { "epoch": 0.1515, "grad_norm": 3.609375, "grad_norm_var": 0.04468994140625, "learning_rate": 0.0001, "loss": 8.7316, "loss/crossentropy": 2.1936429142951965, "loss/hidden": 3.2421875, "loss/jsd": 0.0, "loss/logits": 0.2852545231580734, "step": 2424 }, { "epoch": 0.151625, "grad_norm": 3.15625, "grad_norm_var": 0.04058837890625, "learning_rate": 0.0001, "loss": 8.3339, "loss/crossentropy": 2.1987831592559814, "loss/hidden": 3.1875, "loss/jsd": 0.0, "loss/logits": 0.2727145105600357, "step": 2426 }, { "epoch": 0.15175, "grad_norm": 3.171875, "grad_norm_var": 0.043505859375, "learning_rate": 0.0001, "loss": 8.2834, "loss/crossentropy": 2.6830883026123047, "loss/hidden": 3.34375, "loss/jsd": 0.0, "loss/logits": 0.2955752909183502, "step": 2428 }, { "epoch": 0.151875, "grad_norm": 3.125, "grad_norm_var": 0.0397857666015625, "learning_rate": 0.0001, "loss": 8.2492, "loss/crossentropy": 2.3668758869171143, "loss/hidden": 3.28125, "loss/jsd": 0.0, "loss/logits": 0.2798495292663574, "step": 2430 }, { "epoch": 0.152, "grad_norm": 3.015625, "grad_norm_var": 0.032502237955729166, "learning_rate": 0.0001, "loss": 8.219, "loss/crossentropy": 2.3704047203063965, "loss/hidden": 3.34375, "loss/jsd": 0.0, "loss/logits": 0.259890541434288, "step": 2432 }, { "epoch": 0.152125, "grad_norm": 3.296875, "grad_norm_var": 0.047337849934895836, "learning_rate": 0.0001, "loss": 8.4497, "loss/crossentropy": 2.1782950162887573, "loss/hidden": 3.375, "loss/jsd": 0.0, "loss/logits": 0.2903287261724472, "step": 2434 }, { "epoch": 0.15225, "grad_norm": 3.09375, "grad_norm_var": 0.04848531087239583, "learning_rate": 0.0001, "loss": 8.4486, "loss/crossentropy": 2.4774245023727417, "loss/hidden": 3.25, "loss/jsd": 0.0, "loss/logits": 0.27125996351242065, "step": 2436 }, { "epoch": 0.152375, "grad_norm": 3.21875, "grad_norm_var": 0.04543863932291667, "learning_rate": 0.0001, "loss": 8.3811, "loss/crossentropy": 2.3557363748550415, "loss/hidden": 3.3828125, "loss/jsd": 0.0, "loss/logits": 0.30695493519306183, "step": 2438 }, { "epoch": 0.1525, "grad_norm": 3.234375, "grad_norm_var": 0.034235636393229164, "learning_rate": 0.0001, "loss": 8.0553, "loss/crossentropy": 2.362632989883423, "loss/hidden": 3.3125, "loss/jsd": 0.0, "loss/logits": 0.26735785603523254, "step": 2440 }, { "epoch": 0.152625, "grad_norm": 3.078125, "grad_norm_var": 0.03411051432291667, "learning_rate": 0.0001, "loss": 8.179, "loss/crossentropy": 2.4381628036499023, "loss/hidden": 3.21875, "loss/jsd": 0.0, "loss/logits": 0.2577895522117615, "step": 2442 }, { "epoch": 0.15275, "grad_norm": 3.046875, "grad_norm_var": 0.0404449462890625, "learning_rate": 0.0001, "loss": 8.3991, "loss/crossentropy": 2.350975751876831, "loss/hidden": 3.203125, "loss/jsd": 0.0, "loss/logits": 0.27325308322906494, "step": 2444 }, { "epoch": 0.152875, "grad_norm": 3.28125, "grad_norm_var": 0.03941650390625, "learning_rate": 0.0001, "loss": 8.536, "loss/crossentropy": 2.4157899618148804, "loss/hidden": 3.28125, "loss/jsd": 0.0, "loss/logits": 0.27011483907699585, "step": 2446 }, { "epoch": 0.153, "grad_norm": 4.5, "grad_norm_var": 4.062442016601563, "learning_rate": 0.0001, "loss": 8.919, "loss/crossentropy": 2.437688112258911, "loss/hidden": 3.3671875, "loss/jsd": 0.0, "loss/logits": 0.31261374056339264, "step": 2448 }, { "epoch": 0.153125, "grad_norm": 3.234375, "grad_norm_var": 4.074149576822917, "learning_rate": 0.0001, "loss": 8.5219, "loss/crossentropy": 2.4884743690490723, "loss/hidden": 3.3515625, "loss/jsd": 0.0, "loss/logits": 0.281438983976841, "step": 2450 }, { "epoch": 0.15325, "grad_norm": 3.0, "grad_norm_var": 4.093941243489583, "learning_rate": 0.0001, "loss": 8.2878, "loss/crossentropy": 2.3718087673187256, "loss/hidden": 3.25, "loss/jsd": 0.0, "loss/logits": 0.2740315645933151, "step": 2452 }, { "epoch": 0.153375, "grad_norm": 3.0625, "grad_norm_var": 4.100650024414063, "learning_rate": 0.0001, "loss": 8.5518, "loss/crossentropy": 2.2542319297790527, "loss/hidden": 3.21875, "loss/jsd": 0.0, "loss/logits": 0.267962783575058, "step": 2454 }, { "epoch": 0.1535, "grad_norm": 3.171875, "grad_norm_var": 4.085700480143229, "learning_rate": 0.0001, "loss": 8.3585, "loss/crossentropy": 2.4759573936462402, "loss/hidden": 3.28125, "loss/jsd": 0.0, "loss/logits": 0.2723170071840286, "step": 2456 }, { "epoch": 0.153625, "grad_norm": 3.1875, "grad_norm_var": 4.062303670247396, "learning_rate": 0.0001, "loss": 8.17, "loss/crossentropy": 2.260656952857971, "loss/hidden": 3.1875, "loss/jsd": 0.0, "loss/logits": 0.25791803002357483, "step": 2458 }, { "epoch": 0.15375, "grad_norm": 3.296875, "grad_norm_var": 4.06168212890625, "learning_rate": 0.0001, "loss": 8.2435, "loss/crossentropy": 2.2933273315429688, "loss/hidden": 3.28125, "loss/jsd": 0.0, "loss/logits": 0.2662912607192993, "step": 2460 }, { "epoch": 0.153875, "grad_norm": 3.125, "grad_norm_var": 4.067943318684896, "learning_rate": 0.0001, "loss": 8.049, "loss/crossentropy": 2.1535520553588867, "loss/hidden": 3.171875, "loss/jsd": 0.0, "loss/logits": 0.2517680525779724, "step": 2462 }, { "epoch": 0.154, "grad_norm": 3.0625, "grad_norm_var": 0.0138580322265625, "learning_rate": 0.0001, "loss": 8.0467, "loss/crossentropy": 2.0775814056396484, "loss/hidden": 3.1796875, "loss/jsd": 0.0, "loss/logits": 0.25869153439998627, "step": 2464 }, { "epoch": 0.154125, "grad_norm": 2.984375, "grad_norm_var": 0.012906901041666667, "learning_rate": 0.0001, "loss": 8.1568, "loss/crossentropy": 2.466973304748535, "loss/hidden": 3.2421875, "loss/jsd": 0.0, "loss/logits": 0.27992746233940125, "step": 2466 }, { "epoch": 0.15425, "grad_norm": 3.265625, "grad_norm_var": 0.01207275390625, "learning_rate": 0.0001, "loss": 8.0886, "loss/crossentropy": 2.3320833444595337, "loss/hidden": 3.234375, "loss/jsd": 0.0, "loss/logits": 0.27216966450214386, "step": 2468 }, { "epoch": 0.154375, "grad_norm": 3.0625, "grad_norm_var": 0.0121490478515625, "learning_rate": 0.0001, "loss": 8.109, "loss/crossentropy": 2.2403723001480103, "loss/hidden": 3.203125, "loss/jsd": 0.0, "loss/logits": 0.24139465391635895, "step": 2470 }, { "epoch": 0.1545, "grad_norm": 3.109375, "grad_norm_var": 0.010472615559895834, "learning_rate": 0.0001, "loss": 8.56, "loss/crossentropy": 2.1147449016571045, "loss/hidden": 3.1796875, "loss/jsd": 0.0, "loss/logits": 0.2546851634979248, "step": 2472 }, { "epoch": 0.154625, "grad_norm": 3.140625, "grad_norm_var": 0.009496053059895834, "learning_rate": 0.0001, "loss": 8.2829, "loss/crossentropy": 2.128211796283722, "loss/hidden": 3.2890625, "loss/jsd": 0.0, "loss/logits": 0.2636387571692467, "step": 2474 }, { "epoch": 0.15475, "grad_norm": 3.21875, "grad_norm_var": 0.008250935872395834, "learning_rate": 0.0001, "loss": 8.4869, "loss/crossentropy": 2.5061731338500977, "loss/hidden": 3.2734375, "loss/jsd": 0.0, "loss/logits": 0.27180950343608856, "step": 2476 }, { "epoch": 0.154875, "grad_norm": 3.296875, "grad_norm_var": 0.009325154622395833, "learning_rate": 0.0001, "loss": 8.2537, "loss/crossentropy": 1.90971839427948, "loss/hidden": 3.234375, "loss/jsd": 0.0, "loss/logits": 0.2415306195616722, "step": 2478 }, { "epoch": 0.155, "grad_norm": 2.859375, "grad_norm_var": 0.01353759765625, "learning_rate": 0.0001, "loss": 7.9013, "loss/crossentropy": 2.1773123145103455, "loss/hidden": 3.21875, "loss/jsd": 0.0, "loss/logits": 0.2478671371936798, "step": 2480 }, { "epoch": 0.155125, "grad_norm": 6.90625, "grad_norm_var": 0.901953125, "learning_rate": 0.0001, "loss": 8.4798, "loss/crossentropy": 2.3107967376708984, "loss/hidden": 3.203125, "loss/jsd": 0.0, "loss/logits": 0.2525716871023178, "step": 2482 }, { "epoch": 0.15525, "grad_norm": 3.3125, "grad_norm_var": 0.8995513916015625, "learning_rate": 0.0001, "loss": 8.4744, "loss/crossentropy": 2.247706890106201, "loss/hidden": 3.3359375, "loss/jsd": 0.0, "loss/logits": 0.3075176626443863, "step": 2484 }, { "epoch": 0.155375, "grad_norm": 3.203125, "grad_norm_var": 0.89342041015625, "learning_rate": 0.0001, "loss": 8.1333, "loss/crossentropy": 2.3638851642608643, "loss/hidden": 3.2578125, "loss/jsd": 0.0, "loss/logits": 0.272746741771698, "step": 2486 }, { "epoch": 0.1555, "grad_norm": 3.515625, "grad_norm_var": 0.8865071614583333, "learning_rate": 0.0001, "loss": 8.2671, "loss/crossentropy": 2.267830967903137, "loss/hidden": 3.203125, "loss/jsd": 0.0, "loss/logits": 0.28789058327674866, "step": 2488 }, { "epoch": 0.155625, "grad_norm": 3.140625, "grad_norm_var": 0.8851552327473958, "learning_rate": 0.0001, "loss": 8.1937, "loss/crossentropy": 2.165649652481079, "loss/hidden": 3.2578125, "loss/jsd": 0.0, "loss/logits": 0.26410168409347534, "step": 2490 }, { "epoch": 0.15575, "grad_norm": 3.109375, "grad_norm_var": 0.8902496337890625, "learning_rate": 0.0001, "loss": 8.348, "loss/crossentropy": 2.2928093671798706, "loss/hidden": 3.234375, "loss/jsd": 0.0, "loss/logits": 0.2730616182088852, "step": 2492 }, { "epoch": 0.155875, "grad_norm": 3.6875, "grad_norm_var": 1.3131256103515625, "learning_rate": 0.0001, "loss": 9.0116, "loss/crossentropy": 2.288671135902405, "loss/hidden": 3.203125, "loss/jsd": 0.0, "loss/logits": 0.25715528428554535, "step": 2494 }, { "epoch": 0.156, "grad_norm": 3.515625, "grad_norm_var": 1.245361328125, "learning_rate": 0.0001, "loss": 8.2275, "loss/crossentropy": 2.04764986038208, "loss/hidden": 3.234375, "loss/jsd": 0.0, "loss/logits": 0.2550469785928726, "step": 2496 }, { "epoch": 0.156125, "grad_norm": 3.265625, "grad_norm_var": 0.5328196207682292, "learning_rate": 0.0001, "loss": 8.3368, "loss/crossentropy": 2.1027456521987915, "loss/hidden": 3.171875, "loss/jsd": 0.0, "loss/logits": 0.27562327682971954, "step": 2498 }, { "epoch": 0.15625, "grad_norm": 3.375, "grad_norm_var": 0.5341054280598958, "learning_rate": 0.0001, "loss": 8.3857, "loss/crossentropy": 2.3927990198135376, "loss/hidden": 3.3203125, "loss/jsd": 0.0, "loss/logits": 0.2799486666917801, "step": 2500 }, { "epoch": 0.156375, "grad_norm": 3.421875, "grad_norm_var": 0.5267567952473958, "learning_rate": 0.0001, "loss": 8.3894, "loss/crossentropy": 2.379324197769165, "loss/hidden": 3.25, "loss/jsd": 0.0, "loss/logits": 0.27068574726581573, "step": 2502 }, { "epoch": 0.1565, "grad_norm": 3.0625, "grad_norm_var": 0.5542805989583334, "learning_rate": 0.0001, "loss": 8.6141, "loss/crossentropy": 2.1458650827407837, "loss/hidden": 3.328125, "loss/jsd": 0.0, "loss/logits": 0.3379057049751282, "step": 2504 }, { "epoch": 0.156625, "grad_norm": 3.140625, "grad_norm_var": 0.5693359375, "learning_rate": 0.0001, "loss": 8.3573, "loss/crossentropy": 2.2261852025985718, "loss/hidden": 3.2265625, "loss/jsd": 0.0, "loss/logits": 0.27541545033454895, "step": 2506 }, { "epoch": 0.15675, "grad_norm": 3.15625, "grad_norm_var": 0.56923828125, "learning_rate": 0.0001, "loss": 8.2813, "loss/crossentropy": 2.300438404083252, "loss/hidden": 3.265625, "loss/jsd": 0.0, "loss/logits": 0.26637162268161774, "step": 2508 }, { "epoch": 0.156875, "grad_norm": 3.203125, "grad_norm_var": 0.05836181640625, "learning_rate": 0.0001, "loss": 8.1558, "loss/crossentropy": 2.4900972843170166, "loss/hidden": 3.203125, "loss/jsd": 0.0, "loss/logits": 0.26418106257915497, "step": 2510 }, { "epoch": 0.157, "grad_norm": 3.15625, "grad_norm_var": 0.0244781494140625, "learning_rate": 0.0001, "loss": 8.3795, "loss/crossentropy": 2.510451316833496, "loss/hidden": 3.2265625, "loss/jsd": 0.0, "loss/logits": 0.2828855812549591, "step": 2512 }, { "epoch": 0.157125, "grad_norm": 3.0625, "grad_norm_var": 0.023856608072916667, "learning_rate": 0.0001, "loss": 8.2548, "loss/crossentropy": 2.3360713720321655, "loss/hidden": 3.203125, "loss/jsd": 0.0, "loss/logits": 0.2564430236816406, "step": 2514 }, { "epoch": 0.15725, "grad_norm": 3.265625, "grad_norm_var": 0.018147786458333332, "learning_rate": 0.0001, "loss": 8.4346, "loss/crossentropy": 2.3846248388290405, "loss/hidden": 3.3828125, "loss/jsd": 0.0, "loss/logits": 0.28445227444171906, "step": 2516 }, { "epoch": 0.157375, "grad_norm": 3.34375, "grad_norm_var": 0.019938151041666668, "learning_rate": 0.0001, "loss": 8.2736, "loss/crossentropy": 2.2677204608917236, "loss/hidden": 3.328125, "loss/jsd": 0.0, "loss/logits": 0.2820749282836914, "step": 2518 }, { "epoch": 0.1575, "grad_norm": 3.0, "grad_norm_var": 0.023974609375, "learning_rate": 0.0001, "loss": 8.2038, "loss/crossentropy": 2.469799518585205, "loss/hidden": 3.1875, "loss/jsd": 0.0, "loss/logits": 0.25989628583192825, "step": 2520 }, { "epoch": 0.157625, "grad_norm": 3.28125, "grad_norm_var": 0.024702962239583334, "learning_rate": 0.0001, "loss": 8.3751, "loss/crossentropy": 2.4882609844207764, "loss/hidden": 3.28125, "loss/jsd": 0.0, "loss/logits": 0.2699503153562546, "step": 2522 }, { "epoch": 0.15775, "grad_norm": 3.25, "grad_norm_var": 0.026123046875, "learning_rate": 0.0001, "loss": 8.4694, "loss/crossentropy": 2.5819085836410522, "loss/hidden": 3.25, "loss/jsd": 0.0, "loss/logits": 0.27766086161136627, "step": 2524 }, { "epoch": 0.157875, "grad_norm": 2.90625, "grad_norm_var": 0.025072224934895835, "learning_rate": 0.0001, "loss": 8.1619, "loss/crossentropy": 2.2840874791145325, "loss/hidden": 3.296875, "loss/jsd": 0.0, "loss/logits": 0.29234474897384644, "step": 2526 }, { "epoch": 0.158, "grad_norm": 3.234375, "grad_norm_var": 0.026513671875, "learning_rate": 0.0001, "loss": 8.4086, "loss/crossentropy": 2.4526052474975586, "loss/hidden": 3.2421875, "loss/jsd": 0.0, "loss/logits": 0.28289155662059784, "step": 2528 }, { "epoch": 0.158125, "grad_norm": 3.4375, "grad_norm_var": 0.026513671875, "learning_rate": 0.0001, "loss": 8.7414, "loss/crossentropy": 2.654939293861389, "loss/hidden": 3.40625, "loss/jsd": 0.0, "loss/logits": 0.2989393472671509, "step": 2530 }, { "epoch": 0.15825, "grad_norm": 2.953125, "grad_norm_var": 0.030028279622395834, "learning_rate": 0.0001, "loss": 8.2029, "loss/crossentropy": 2.375227451324463, "loss/hidden": 3.25, "loss/jsd": 0.0, "loss/logits": 0.29081976413726807, "step": 2532 }, { "epoch": 0.158375, "grad_norm": 2.953125, "grad_norm_var": 0.024201456705729166, "learning_rate": 0.0001, "loss": 8.2599, "loss/crossentropy": 2.2531535625457764, "loss/hidden": 3.265625, "loss/jsd": 0.0, "loss/logits": 0.26515205204486847, "step": 2534 }, { "epoch": 0.1585, "grad_norm": 3.203125, "grad_norm_var": 0.0256744384765625, "learning_rate": 0.0001, "loss": 8.6216, "loss/crossentropy": 2.3937805891036987, "loss/hidden": 3.3515625, "loss/jsd": 0.0, "loss/logits": 0.29868148267269135, "step": 2536 }, { "epoch": 0.158625, "grad_norm": 3.609375, "grad_norm_var": 0.03931884765625, "learning_rate": 0.0001, "loss": 8.7359, "loss/crossentropy": 2.475276470184326, "loss/hidden": 3.34375, "loss/jsd": 0.0, "loss/logits": 0.28303323686122894, "step": 2538 }, { "epoch": 0.15875, "grad_norm": 3.328125, "grad_norm_var": 0.040526326497395834, "learning_rate": 0.0001, "loss": 8.1197, "loss/crossentropy": 2.0302132964134216, "loss/hidden": 3.21875, "loss/jsd": 0.0, "loss/logits": 0.2279423400759697, "step": 2540 }, { "epoch": 0.158875, "grad_norm": 3.0625, "grad_norm_var": 0.036774698893229166, "learning_rate": 0.0001, "loss": 8.4462, "loss/crossentropy": 2.604537010192871, "loss/hidden": 3.2890625, "loss/jsd": 0.0, "loss/logits": 0.2865126132965088, "step": 2542 }, { "epoch": 0.159, "grad_norm": 3.109375, "grad_norm_var": 0.03657124837239583, "learning_rate": 0.0001, "loss": 8.4149, "loss/crossentropy": 2.3719130754470825, "loss/hidden": 3.2109375, "loss/jsd": 0.0, "loss/logits": 0.2572627142071724, "step": 2544 }, { "epoch": 0.159125, "grad_norm": 3.0625, "grad_norm_var": 0.033772786458333336, "learning_rate": 0.0001, "loss": 8.226, "loss/crossentropy": 2.290672540664673, "loss/hidden": 3.3515625, "loss/jsd": 0.0, "loss/logits": 0.3219170421361923, "step": 2546 }, { "epoch": 0.15925, "grad_norm": 3.046875, "grad_norm_var": 0.031636555989583336, "learning_rate": 0.0001, "loss": 8.2854, "loss/crossentropy": 2.145516276359558, "loss/hidden": 3.1875, "loss/jsd": 0.0, "loss/logits": 0.2706049084663391, "step": 2548 }, { "epoch": 0.159375, "grad_norm": 3.359375, "grad_norm_var": 0.03004150390625, "learning_rate": 0.0001, "loss": 8.2928, "loss/crossentropy": 2.5301170349121094, "loss/hidden": 3.1484375, "loss/jsd": 0.0, "loss/logits": 0.2519679069519043, "step": 2550 }, { "epoch": 0.1595, "grad_norm": 3.109375, "grad_norm_var": 0.0247955322265625, "learning_rate": 0.0001, "loss": 8.3513, "loss/crossentropy": 2.4736965894699097, "loss/hidden": 3.328125, "loss/jsd": 0.0, "loss/logits": 0.27385158836841583, "step": 2552 }, { "epoch": 0.159625, "grad_norm": 3.0625, "grad_norm_var": 0.010282389322916667, "learning_rate": 0.0001, "loss": 8.4279, "loss/crossentropy": 2.6575098037719727, "loss/hidden": 3.328125, "loss/jsd": 0.0, "loss/logits": 0.28586356341838837, "step": 2554 }, { "epoch": 0.15975, "grad_norm": 3.203125, "grad_norm_var": 0.010188802083333334, "learning_rate": 0.0001, "loss": 8.3428, "loss/crossentropy": 2.436043381690979, "loss/hidden": 3.3046875, "loss/jsd": 0.0, "loss/logits": 0.2793993800878525, "step": 2556 }, { "epoch": 0.159875, "grad_norm": 3.09375, "grad_norm_var": 0.01025390625, "learning_rate": 0.0001, "loss": 8.2126, "loss/crossentropy": 2.388734817504883, "loss/hidden": 3.2421875, "loss/jsd": 0.0, "loss/logits": 0.24712733924388885, "step": 2558 }, { "epoch": 0.16, "grad_norm": 3.3125, "grad_norm_var": 0.012140909830729166, "learning_rate": 0.0001, "loss": 8.341, "loss/crossentropy": 2.488566756248474, "loss/hidden": 3.296875, "loss/jsd": 0.0, "loss/logits": 0.28663991391658783, "step": 2560 }, { "epoch": 0.160125, "grad_norm": 3.0, "grad_norm_var": 0.012626139322916667, "learning_rate": 0.0001, "loss": 8.1953, "loss/crossentropy": 2.360735058784485, "loss/hidden": 3.234375, "loss/jsd": 0.0, "loss/logits": 0.2733890935778618, "step": 2562 }, { "epoch": 0.16025, "grad_norm": 3.125, "grad_norm_var": 0.012262980143229166, "learning_rate": 0.0001, "loss": 8.2249, "loss/crossentropy": 2.49215030670166, "loss/hidden": 3.3046875, "loss/jsd": 0.0, "loss/logits": 0.29839469492435455, "step": 2564 }, { "epoch": 0.160375, "grad_norm": 2.96875, "grad_norm_var": 0.00963134765625, "learning_rate": 0.0001, "loss": 8.2714, "loss/crossentropy": 2.4546769857406616, "loss/hidden": 3.1796875, "loss/jsd": 0.0, "loss/logits": 0.2619743049144745, "step": 2566 }, { "epoch": 0.1605, "grad_norm": 3.1875, "grad_norm_var": 0.013232421875, "learning_rate": 0.0001, "loss": 8.3612, "loss/crossentropy": 2.429980993270874, "loss/hidden": 3.2265625, "loss/jsd": 0.0, "loss/logits": 0.2659847140312195, "step": 2568 }, { "epoch": 0.160625, "grad_norm": 3.0625, "grad_norm_var": 0.013704427083333333, "learning_rate": 0.0001, "loss": 8.1659, "loss/crossentropy": 2.3688398003578186, "loss/hidden": 3.1953125, "loss/jsd": 0.0, "loss/logits": 0.23781683295965195, "step": 2570 }, { "epoch": 0.16075, "grad_norm": 3.046875, "grad_norm_var": 0.026318359375, "learning_rate": 0.0001, "loss": 8.346, "loss/crossentropy": 2.4594470262527466, "loss/hidden": 3.2421875, "loss/jsd": 0.0, "loss/logits": 0.2740607261657715, "step": 2572 }, { "epoch": 0.160875, "grad_norm": 3.265625, "grad_norm_var": 0.027985636393229166, "learning_rate": 0.0001, "loss": 8.268, "loss/crossentropy": 2.194278836250305, "loss/hidden": 3.1875, "loss/jsd": 0.0, "loss/logits": 0.26395024359226227, "step": 2574 }, { "epoch": 0.161, "grad_norm": 2.875, "grad_norm_var": 0.029378255208333332, "learning_rate": 0.0001, "loss": 8.1956, "loss/crossentropy": 2.1503721475601196, "loss/hidden": 3.125, "loss/jsd": 0.0, "loss/logits": 0.24605603516101837, "step": 2576 }, { "epoch": 0.161125, "grad_norm": 3.59375, "grad_norm_var": 0.046971638997395836, "learning_rate": 0.0001, "loss": 8.2973, "loss/crossentropy": 2.331193447113037, "loss/hidden": 3.203125, "loss/jsd": 0.0, "loss/logits": 0.2738886624574661, "step": 2578 }, { "epoch": 0.16125, "grad_norm": 3.09375, "grad_norm_var": 0.046971638997395836, "learning_rate": 0.0001, "loss": 8.3282, "loss/crossentropy": 2.3280651569366455, "loss/hidden": 3.2265625, "loss/jsd": 0.0, "loss/logits": 0.280707448720932, "step": 2580 }, { "epoch": 0.161375, "grad_norm": 3.1875, "grad_norm_var": 0.04517822265625, "learning_rate": 0.0001, "loss": 8.2786, "loss/crossentropy": 2.2947434186935425, "loss/hidden": 3.2265625, "loss/jsd": 0.0, "loss/logits": 0.28385594487190247, "step": 2582 }, { "epoch": 0.1615, "grad_norm": 3.09375, "grad_norm_var": 0.03827718098958333, "learning_rate": 0.0001, "loss": 8.3341, "loss/crossentropy": 2.4629331827163696, "loss/hidden": 3.1875, "loss/jsd": 0.0, "loss/logits": 0.2666812241077423, "step": 2584 }, { "epoch": 0.161625, "grad_norm": 3.25, "grad_norm_var": 0.0423248291015625, "learning_rate": 0.0001, "loss": 8.6395, "loss/crossentropy": 2.167258083820343, "loss/hidden": 3.4296875, "loss/jsd": 0.0, "loss/logits": 0.31057223677635193, "step": 2586 }, { "epoch": 0.16175, "grad_norm": 4.5, "grad_norm_var": 0.1481109619140625, "learning_rate": 0.0001, "loss": 8.2884, "loss/crossentropy": 2.116983652114868, "loss/hidden": 3.296875, "loss/jsd": 0.0, "loss/logits": 0.2682839557528496, "step": 2588 }, { "epoch": 0.161875, "grad_norm": 3.34375, "grad_norm_var": 0.15274149576822918, "learning_rate": 0.0001, "loss": 8.7022, "loss/crossentropy": 2.733883261680603, "loss/hidden": 3.3359375, "loss/jsd": 0.0, "loss/logits": 0.3234509229660034, "step": 2590 }, { "epoch": 0.162, "grad_norm": 3.0625, "grad_norm_var": 0.14871317545572918, "learning_rate": 0.0001, "loss": 8.1477, "loss/crossentropy": 2.146886646747589, "loss/hidden": 3.2265625, "loss/jsd": 0.0, "loss/logits": 0.26147788763046265, "step": 2592 }, { "epoch": 0.162125, "grad_norm": 2.921875, "grad_norm_var": 0.14615885416666666, "learning_rate": 0.0001, "loss": 8.1554, "loss/crossentropy": 2.311735153198242, "loss/hidden": 3.203125, "loss/jsd": 0.0, "loss/logits": 0.26781047880649567, "step": 2594 }, { "epoch": 0.16225, "grad_norm": 3.140625, "grad_norm_var": 0.14546610514322916, "learning_rate": 0.0001, "loss": 8.177, "loss/crossentropy": 2.4164129495620728, "loss/hidden": 3.2265625, "loss/jsd": 0.0, "loss/logits": 0.26862579584121704, "step": 2596 }, { "epoch": 0.162375, "grad_norm": 3.234375, "grad_norm_var": 0.14485270182291668, "learning_rate": 0.0001, "loss": 8.2389, "loss/crossentropy": 2.1019481420516968, "loss/hidden": 3.140625, "loss/jsd": 0.0, "loss/logits": 0.26436343789100647, "step": 2598 }, { "epoch": 0.1625, "grad_norm": 3.34375, "grad_norm_var": 0.14814046223958333, "learning_rate": 0.0001, "loss": 8.481, "loss/crossentropy": 2.262871265411377, "loss/hidden": 3.2265625, "loss/jsd": 0.0, "loss/logits": 0.2700708210468292, "step": 2600 }, { "epoch": 0.162625, "grad_norm": 3.234375, "grad_norm_var": 0.14741923014322916, "learning_rate": 0.0001, "loss": 8.2838, "loss/crossentropy": 2.635279059410095, "loss/hidden": 3.2578125, "loss/jsd": 0.0, "loss/logits": 0.29194803535938263, "step": 2602 }, { "epoch": 0.16275, "grad_norm": 2.984375, "grad_norm_var": 0.030952962239583333, "learning_rate": 0.0001, "loss": 8.2658, "loss/crossentropy": 2.4246195554733276, "loss/hidden": 3.2734375, "loss/jsd": 0.0, "loss/logits": 0.2945238947868347, "step": 2604 }, { "epoch": 0.162875, "grad_norm": 3.25, "grad_norm_var": 0.0225738525390625, "learning_rate": 0.0001, "loss": 8.2605, "loss/crossentropy": 2.0880810022354126, "loss/hidden": 3.2421875, "loss/jsd": 0.0, "loss/logits": 0.263704776763916, "step": 2606 }, { "epoch": 0.163, "grad_norm": 3.03125, "grad_norm_var": 0.024706013997395835, "learning_rate": 0.0001, "loss": 8.1831, "loss/crossentropy": 2.080967903137207, "loss/hidden": 3.203125, "loss/jsd": 0.0, "loss/logits": 0.24847671389579773, "step": 2608 }, { "epoch": 0.163125, "grad_norm": 3.125, "grad_norm_var": 0.019270833333333334, "learning_rate": 0.0001, "loss": 8.416, "loss/crossentropy": 2.363895058631897, "loss/hidden": 3.1953125, "loss/jsd": 0.0, "loss/logits": 0.28405146300792694, "step": 2610 }, { "epoch": 0.16325, "grad_norm": 3.0, "grad_norm_var": 0.0218658447265625, "learning_rate": 0.0001, "loss": 8.3748, "loss/crossentropy": 2.4413585662841797, "loss/hidden": 3.2421875, "loss/jsd": 0.0, "loss/logits": 0.28752467036247253, "step": 2612 }, { "epoch": 0.163375, "grad_norm": 3.109375, "grad_norm_var": 0.022728474934895833, "learning_rate": 0.0001, "loss": 8.2726, "loss/crossentropy": 2.3740620613098145, "loss/hidden": 3.21875, "loss/jsd": 0.0, "loss/logits": 0.2649526298046112, "step": 2614 }, { "epoch": 0.1635, "grad_norm": 3.203125, "grad_norm_var": 0.027534993489583333, "learning_rate": 0.0001, "loss": 8.3047, "loss/crossentropy": 2.2964380979537964, "loss/hidden": 3.1953125, "loss/jsd": 0.0, "loss/logits": 0.2646304666996002, "step": 2616 }, { "epoch": 0.163625, "grad_norm": 3.4375, "grad_norm_var": 0.038736979166666664, "learning_rate": 0.0001, "loss": 8.2126, "loss/crossentropy": 2.1447632908821106, "loss/hidden": 3.15625, "loss/jsd": 0.0, "loss/logits": 0.24735675752162933, "step": 2618 }, { "epoch": 0.16375, "grad_norm": 3.625, "grad_norm_var": 0.0572418212890625, "learning_rate": 0.0001, "loss": 8.5174, "loss/crossentropy": 2.433838725090027, "loss/hidden": 3.2578125, "loss/jsd": 0.0, "loss/logits": 0.2876787632703781, "step": 2620 }, { "epoch": 0.163875, "grad_norm": 3.03125, "grad_norm_var": 0.059521484375, "learning_rate": 0.0001, "loss": 8.0701, "loss/crossentropy": 2.204833507537842, "loss/hidden": 3.234375, "loss/jsd": 0.0, "loss/logits": 0.2602345570921898, "step": 2622 }, { "epoch": 0.164, "grad_norm": 3.375, "grad_norm_var": 0.0607086181640625, "learning_rate": 0.0001, "loss": 8.3564, "loss/crossentropy": 2.4403984546661377, "loss/hidden": 3.265625, "loss/jsd": 0.0, "loss/logits": 0.28500233590602875, "step": 2624 }, { "epoch": 0.164125, "grad_norm": 3.25, "grad_norm_var": 0.06425374348958333, "learning_rate": 0.0001, "loss": 8.4073, "loss/crossentropy": 2.461033344268799, "loss/hidden": 3.1875, "loss/jsd": 0.0, "loss/logits": 0.2762896418571472, "step": 2626 }, { "epoch": 0.16425, "grad_norm": 3.359375, "grad_norm_var": 0.0610748291015625, "learning_rate": 0.0001, "loss": 8.2333, "loss/crossentropy": 2.3412665128707886, "loss/hidden": 3.171875, "loss/jsd": 0.0, "loss/logits": 0.2702695280313492, "step": 2628 }, { "epoch": 0.164375, "grad_norm": 3.078125, "grad_norm_var": 0.0517974853515625, "learning_rate": 0.0001, "loss": 8.471, "loss/crossentropy": 2.395104169845581, "loss/hidden": 3.328125, "loss/jsd": 0.0, "loss/logits": 0.266902893781662, "step": 2630 }, { "epoch": 0.1645, "grad_norm": 3.0625, "grad_norm_var": 0.0504058837890625, "learning_rate": 0.0001, "loss": 8.4257, "loss/crossentropy": 2.5319817066192627, "loss/hidden": 3.2421875, "loss/jsd": 0.0, "loss/logits": 0.28416794538497925, "step": 2632 }, { "epoch": 0.164625, "grad_norm": 3.59375, "grad_norm_var": 0.048291015625, "learning_rate": 0.0001, "loss": 8.0315, "loss/crossentropy": 2.1557105779647827, "loss/hidden": 3.1953125, "loss/jsd": 0.0, "loss/logits": 0.2669526934623718, "step": 2634 }, { "epoch": 0.16475, "grad_norm": 2.96875, "grad_norm_var": 0.04273681640625, "learning_rate": 0.0001, "loss": 8.3233, "loss/crossentropy": 2.5935251712799072, "loss/hidden": 3.2421875, "loss/jsd": 0.0, "loss/logits": 0.24305777996778488, "step": 2636 }, { "epoch": 0.164875, "grad_norm": 3.1875, "grad_norm_var": 0.033910115559895836, "learning_rate": 0.0001, "loss": 8.4456, "loss/crossentropy": 2.489887237548828, "loss/hidden": 3.2578125, "loss/jsd": 0.0, "loss/logits": 0.26338575780391693, "step": 2638 }, { "epoch": 0.165, "grad_norm": 2.859375, "grad_norm_var": 0.03730061848958333, "learning_rate": 0.0001, "loss": 8.0705, "loss/crossentropy": 2.252521276473999, "loss/hidden": 3.1953125, "loss/jsd": 0.0, "loss/logits": 0.24118152260780334, "step": 2640 }, { "epoch": 0.165125, "grad_norm": 3.1875, "grad_norm_var": 0.03505757649739583, "learning_rate": 0.0001, "loss": 8.007, "loss/crossentropy": 2.2367311120033264, "loss/hidden": 3.2109375, "loss/jsd": 0.0, "loss/logits": 0.25346408784389496, "step": 2642 }, { "epoch": 0.16525, "grad_norm": 3.15625, "grad_norm_var": 0.03387044270833333, "learning_rate": 0.0001, "loss": 8.2971, "loss/crossentropy": 2.2261271476745605, "loss/hidden": 3.3203125, "loss/jsd": 0.0, "loss/logits": 0.2599840611219406, "step": 2644 }, { "epoch": 0.165375, "grad_norm": 3.078125, "grad_norm_var": 0.0338775634765625, "learning_rate": 0.0001, "loss": 8.2174, "loss/crossentropy": 2.3545104265213013, "loss/hidden": 3.21875, "loss/jsd": 0.0, "loss/logits": 0.268455445766449, "step": 2646 }, { "epoch": 0.1655, "grad_norm": 3.234375, "grad_norm_var": 0.037816365559895836, "learning_rate": 0.0001, "loss": 8.1461, "loss/crossentropy": 2.2744003534317017, "loss/hidden": 3.171875, "loss/jsd": 0.0, "loss/logits": 0.23194655776023865, "step": 2648 }, { "epoch": 0.165625, "grad_norm": 3.140625, "grad_norm_var": 0.0201812744140625, "learning_rate": 0.0001, "loss": 8.4255, "loss/crossentropy": 2.292569160461426, "loss/hidden": 3.2109375, "loss/jsd": 0.0, "loss/logits": 0.25596070289611816, "step": 2650 }, { "epoch": 0.16575, "grad_norm": 3.046875, "grad_norm_var": 0.024983723958333332, "learning_rate": 0.0001, "loss": 8.1794, "loss/crossentropy": 2.06977915763855, "loss/hidden": 3.1796875, "loss/jsd": 0.0, "loss/logits": 0.22923216223716736, "step": 2652 }, { "epoch": 0.165875, "grad_norm": 3.25, "grad_norm_var": 0.025641886393229167, "learning_rate": 0.0001, "loss": 8.4059, "loss/crossentropy": 2.262938976287842, "loss/hidden": 3.25, "loss/jsd": 0.0, "loss/logits": 0.2663477659225464, "step": 2654 }, { "epoch": 0.166, "grad_norm": 3.203125, "grad_norm_var": 0.0209625244140625, "learning_rate": 0.0001, "loss": 8.337, "loss/crossentropy": 1.9289529919624329, "loss/hidden": 3.2578125, "loss/jsd": 0.0, "loss/logits": 0.2564796954393387, "step": 2656 }, { "epoch": 0.166125, "grad_norm": 3.171875, "grad_norm_var": 0.0193359375, "learning_rate": 0.0001, "loss": 8.2522, "loss/crossentropy": 2.4027702808380127, "loss/hidden": 3.25, "loss/jsd": 0.0, "loss/logits": 0.282346248626709, "step": 2658 }, { "epoch": 0.16625, "grad_norm": 2.859375, "grad_norm_var": 0.0200103759765625, "learning_rate": 0.0001, "loss": 8.1585, "loss/crossentropy": 2.121252119541168, "loss/hidden": 3.203125, "loss/jsd": 0.0, "loss/logits": 0.26361793279647827, "step": 2660 }, { "epoch": 0.166375, "grad_norm": 3.234375, "grad_norm_var": 0.02109375, "learning_rate": 0.0001, "loss": 8.1217, "loss/crossentropy": 2.355001926422119, "loss/hidden": 3.1953125, "loss/jsd": 0.0, "loss/logits": 0.25889521837234497, "step": 2662 }, { "epoch": 0.1665, "grad_norm": 3.125, "grad_norm_var": 0.01630859375, "learning_rate": 0.0001, "loss": 8.2613, "loss/crossentropy": 2.3688048124313354, "loss/hidden": 3.2578125, "loss/jsd": 0.0, "loss/logits": 0.27535010874271393, "step": 2664 }, { "epoch": 0.166625, "grad_norm": 3.140625, "grad_norm_var": 0.014225260416666666, "learning_rate": 0.0001, "loss": 8.0862, "loss/crossentropy": 2.3740497827529907, "loss/hidden": 3.1875, "loss/jsd": 0.0, "loss/logits": 0.2727824002504349, "step": 2666 }, { "epoch": 0.16675, "grad_norm": 3.1875, "grad_norm_var": 0.015738932291666667, "learning_rate": 0.0001, "loss": 8.3176, "loss/crossentropy": 2.3888481855392456, "loss/hidden": 3.15625, "loss/jsd": 0.0, "loss/logits": 0.2630738615989685, "step": 2668 }, { "epoch": 0.166875, "grad_norm": 3.03125, "grad_norm_var": 0.016942342122395832, "learning_rate": 0.0001, "loss": 8.156, "loss/crossentropy": 2.0409966707229614, "loss/hidden": 3.1796875, "loss/jsd": 0.0, "loss/logits": 0.2550422251224518, "step": 2670 }, { "epoch": 0.167, "grad_norm": 3.171875, "grad_norm_var": 0.017236328125, "learning_rate": 0.0001, "loss": 8.4412, "loss/crossentropy": 2.5270535945892334, "loss/hidden": 3.296875, "loss/jsd": 0.0, "loss/logits": 0.29448163509368896, "step": 2672 }, { "epoch": 0.167125, "grad_norm": 2.96875, "grad_norm_var": 0.028661092122395832, "learning_rate": 0.0001, "loss": 8.0915, "loss/crossentropy": 2.235354781150818, "loss/hidden": 3.28125, "loss/jsd": 0.0, "loss/logits": 0.26712487637996674, "step": 2674 }, { "epoch": 0.16725, "grad_norm": 3.09375, "grad_norm_var": 0.0249664306640625, "learning_rate": 0.0001, "loss": 7.9645, "loss/crossentropy": 2.1596986055374146, "loss/hidden": 3.203125, "loss/jsd": 0.0, "loss/logits": 0.2632744610309601, "step": 2676 }, { "epoch": 0.167375, "grad_norm": 3.03125, "grad_norm_var": 0.0266510009765625, "learning_rate": 0.0001, "loss": 8.1883, "loss/crossentropy": 2.2907787561416626, "loss/hidden": 3.28125, "loss/jsd": 0.0, "loss/logits": 0.2716197818517685, "step": 2678 }, { "epoch": 0.1675, "grad_norm": 2.796875, "grad_norm_var": 0.0345367431640625, "learning_rate": 0.0001, "loss": 8.1149, "loss/crossentropy": 2.200819969177246, "loss/hidden": 3.2265625, "loss/jsd": 0.0, "loss/logits": 0.28321780264377594, "step": 2680 }, { "epoch": 0.167625, "grad_norm": 3.0625, "grad_norm_var": 0.03452860514322917, "learning_rate": 0.0001, "loss": 8.3545, "loss/crossentropy": 2.5655312538146973, "loss/hidden": 3.1953125, "loss/jsd": 0.0, "loss/logits": 0.26827070116996765, "step": 2682 }, { "epoch": 0.16775, "grad_norm": 2.875, "grad_norm_var": 0.02847900390625, "learning_rate": 0.0001, "loss": 7.964, "loss/crossentropy": 2.368655800819397, "loss/hidden": 3.1796875, "loss/jsd": 0.0, "loss/logits": 0.25074201822280884, "step": 2684 }, { "epoch": 0.167875, "grad_norm": 2.890625, "grad_norm_var": 0.030647786458333333, "learning_rate": 0.0001, "loss": 8.2303, "loss/crossentropy": 2.522045135498047, "loss/hidden": 3.28125, "loss/jsd": 0.0, "loss/logits": 0.289681613445282, "step": 2686 }, { "epoch": 0.168, "grad_norm": 3.109375, "grad_norm_var": 0.0275299072265625, "learning_rate": 0.0001, "loss": 8.212, "loss/crossentropy": 2.3031221628189087, "loss/hidden": 3.2421875, "loss/jsd": 0.0, "loss/logits": 0.2869686484336853, "step": 2688 }, { "epoch": 0.168125, "grad_norm": 3.1875, "grad_norm_var": 0.014427693684895833, "learning_rate": 0.0001, "loss": 8.1903, "loss/crossentropy": 2.2180505990982056, "loss/hidden": 3.234375, "loss/jsd": 0.0, "loss/logits": 0.2845149040222168, "step": 2690 }, { "epoch": 0.16825, "grad_norm": 3.078125, "grad_norm_var": 0.014420572916666667, "learning_rate": 0.0001, "loss": 8.1318, "loss/crossentropy": 2.1774216294288635, "loss/hidden": 3.1796875, "loss/jsd": 0.0, "loss/logits": 0.27391664683818817, "step": 2692 }, { "epoch": 0.168375, "grad_norm": 2.90625, "grad_norm_var": 0.016747029622395833, "learning_rate": 0.0001, "loss": 8.1924, "loss/crossentropy": 2.3473572731018066, "loss/hidden": 3.2578125, "loss/jsd": 0.0, "loss/logits": 0.25838781893253326, "step": 2694 }, { "epoch": 0.1685, "grad_norm": 3.546875, "grad_norm_var": 0.028385416666666666, "learning_rate": 0.0001, "loss": 8.1097, "loss/crossentropy": 2.3881657123565674, "loss/hidden": 3.203125, "loss/jsd": 0.0, "loss/logits": 0.2789221554994583, "step": 2696 }, { "epoch": 0.168625, "grad_norm": 3.3125, "grad_norm_var": 0.061421712239583336, "learning_rate": 0.0001, "loss": 8.1672, "loss/crossentropy": 2.1731218099594116, "loss/hidden": 3.2265625, "loss/jsd": 0.0, "loss/logits": 0.24963228404521942, "step": 2698 }, { "epoch": 0.16875, "grad_norm": 3.125, "grad_norm_var": 0.051423136393229166, "learning_rate": 0.0001, "loss": 8.3729, "loss/crossentropy": 2.2801939249038696, "loss/hidden": 3.203125, "loss/jsd": 0.0, "loss/logits": 0.26613737642765045, "step": 2700 }, { "epoch": 0.168875, "grad_norm": 3.25, "grad_norm_var": 0.045653279622395834, "learning_rate": 0.0001, "loss": 8.3193, "loss/crossentropy": 2.245741128921509, "loss/hidden": 3.171875, "loss/jsd": 0.0, "loss/logits": 0.27671462297439575, "step": 2702 }, { "epoch": 0.169, "grad_norm": 3.125, "grad_norm_var": 0.043680826822916664, "learning_rate": 0.0001, "loss": 8.3377, "loss/crossentropy": 2.2457879781723022, "loss/hidden": 3.2109375, "loss/jsd": 0.0, "loss/logits": 0.26047301292419434, "step": 2704 }, { "epoch": 0.169125, "grad_norm": 3.125, "grad_norm_var": 0.04413960774739583, "learning_rate": 0.0001, "loss": 8.177, "loss/crossentropy": 2.5720086097717285, "loss/hidden": 3.203125, "loss/jsd": 0.0, "loss/logits": 0.2824274003505707, "step": 2706 }, { "epoch": 0.16925, "grad_norm": 3.078125, "grad_norm_var": 0.04474995930989583, "learning_rate": 0.0001, "loss": 8.2283, "loss/crossentropy": 2.1722124814987183, "loss/hidden": 3.21875, "loss/jsd": 0.0, "loss/logits": 0.28728562593460083, "step": 2708 }, { "epoch": 0.169375, "grad_norm": 3.140625, "grad_norm_var": 0.04452718098958333, "learning_rate": 0.0001, "loss": 8.3363, "loss/crossentropy": 2.4735668897628784, "loss/hidden": 3.234375, "loss/jsd": 0.0, "loss/logits": 0.279041588306427, "step": 2710 }, { "epoch": 0.1695, "grad_norm": 3.125, "grad_norm_var": 0.03596903483072917, "learning_rate": 0.0001, "loss": 8.3597, "loss/crossentropy": 2.4349948167800903, "loss/hidden": 3.25, "loss/jsd": 0.0, "loss/logits": 0.3199606239795685, "step": 2712 }, { "epoch": 0.169625, "grad_norm": 2.984375, "grad_norm_var": 0.015458170572916667, "learning_rate": 0.0001, "loss": 8.3392, "loss/crossentropy": 2.2743479013442993, "loss/hidden": 3.1953125, "loss/jsd": 0.0, "loss/logits": 0.25963588804006577, "step": 2714 }, { "epoch": 0.16975, "grad_norm": 2.984375, "grad_norm_var": 0.017975870768229166, "learning_rate": 0.0001, "loss": 8.1114, "loss/crossentropy": 2.3894020318984985, "loss/hidden": 3.234375, "loss/jsd": 0.0, "loss/logits": 0.26319143176078796, "step": 2716 }, { "epoch": 0.169875, "grad_norm": 2.90625, "grad_norm_var": 0.022412109375, "learning_rate": 0.0001, "loss": 8.119, "loss/crossentropy": 2.4183582067489624, "loss/hidden": 3.2109375, "loss/jsd": 0.0, "loss/logits": 0.2642783522605896, "step": 2718 }, { "epoch": 0.17, "grad_norm": 3.203125, "grad_norm_var": 0.025560506184895835, "learning_rate": 0.0001, "loss": 8.1457, "loss/crossentropy": 2.218894124031067, "loss/hidden": 3.1796875, "loss/jsd": 0.0, "loss/logits": 0.2494705617427826, "step": 2720 }, { "epoch": 0.170125, "grad_norm": 3.21875, "grad_norm_var": 0.029157511393229165, "learning_rate": 0.0001, "loss": 8.4283, "loss/crossentropy": 2.3041937351226807, "loss/hidden": 3.203125, "loss/jsd": 0.0, "loss/logits": 0.27814508974552155, "step": 2722 }, { "epoch": 0.17025, "grad_norm": 2.953125, "grad_norm_var": 0.03127848307291667, "learning_rate": 0.0001, "loss": 8.2177, "loss/crossentropy": 2.212631940841675, "loss/hidden": 3.203125, "loss/jsd": 0.0, "loss/logits": 0.25353027880191803, "step": 2724 }, { "epoch": 0.170375, "grad_norm": 3.375, "grad_norm_var": 0.024665323893229167, "learning_rate": 0.0001, "loss": 8.2822, "loss/crossentropy": 2.395217537879944, "loss/hidden": 3.265625, "loss/jsd": 0.0, "loss/logits": 0.2819966673851013, "step": 2726 }, { "epoch": 0.1705, "grad_norm": 3.09375, "grad_norm_var": 0.027074178059895832, "learning_rate": 0.0001, "loss": 8.6286, "loss/crossentropy": 2.7145198583602905, "loss/hidden": 3.2734375, "loss/jsd": 0.0, "loss/logits": 0.26581692695617676, "step": 2728 }, { "epoch": 0.170625, "grad_norm": 3.1875, "grad_norm_var": 0.026227823893229165, "learning_rate": 0.0001, "loss": 8.3205, "loss/crossentropy": 2.30352520942688, "loss/hidden": 3.1796875, "loss/jsd": 0.0, "loss/logits": 0.2516012564301491, "step": 2730 }, { "epoch": 0.17075, "grad_norm": 2.9375, "grad_norm_var": 0.0263336181640625, "learning_rate": 0.0001, "loss": 8.1426, "loss/crossentropy": 2.5575900077819824, "loss/hidden": 3.2421875, "loss/jsd": 0.0, "loss/logits": 0.25674693286418915, "step": 2732 }, { "epoch": 0.170875, "grad_norm": 3.203125, "grad_norm_var": 0.02105712890625, "learning_rate": 0.0001, "loss": 8.2278, "loss/crossentropy": 2.5535298585891724, "loss/hidden": 3.25, "loss/jsd": 0.0, "loss/logits": 0.2858099341392517, "step": 2734 }, { "epoch": 0.171, "grad_norm": 3.09375, "grad_norm_var": 0.018220011393229166, "learning_rate": 0.0001, "loss": 7.9985, "loss/crossentropy": 2.249255061149597, "loss/hidden": 3.2109375, "loss/jsd": 0.0, "loss/logits": 0.24624134600162506, "step": 2736 }, { "epoch": 0.171125, "grad_norm": 2.890625, "grad_norm_var": 0.019391886393229165, "learning_rate": 0.0001, "loss": 8.2368, "loss/crossentropy": 2.3054301738739014, "loss/hidden": 3.171875, "loss/jsd": 0.0, "loss/logits": 0.255063071846962, "step": 2738 }, { "epoch": 0.17125, "grad_norm": 3.046875, "grad_norm_var": 0.017801920572916668, "learning_rate": 0.0001, "loss": 8.3647, "loss/crossentropy": 2.4051593542099, "loss/hidden": 3.25, "loss/jsd": 0.0, "loss/logits": 0.3074689656496048, "step": 2740 }, { "epoch": 0.171375, "grad_norm": 3.359375, "grad_norm_var": 0.017365519205729166, "learning_rate": 0.0001, "loss": 8.1814, "loss/crossentropy": 2.451170325279236, "loss/hidden": 3.1796875, "loss/jsd": 0.0, "loss/logits": 0.2629931569099426, "step": 2742 }, { "epoch": 0.1715, "grad_norm": 3.078125, "grad_norm_var": 0.014802042643229167, "learning_rate": 0.0001, "loss": 8.2261, "loss/crossentropy": 2.5258573293685913, "loss/hidden": 3.265625, "loss/jsd": 0.0, "loss/logits": 0.28129981458187103, "step": 2744 }, { "epoch": 0.171625, "grad_norm": 3.046875, "grad_norm_var": 0.014232381184895834, "learning_rate": 0.0001, "loss": 8.2026, "loss/crossentropy": 2.59726619720459, "loss/hidden": 3.1796875, "loss/jsd": 0.0, "loss/logits": 0.27656108140945435, "step": 2746 }, { "epoch": 0.17175, "grad_norm": 3.09375, "grad_norm_var": 0.012984212239583333, "learning_rate": 0.0001, "loss": 8.3833, "loss/crossentropy": 2.590458631515503, "loss/hidden": 3.25, "loss/jsd": 0.0, "loss/logits": 0.2706581801176071, "step": 2748 }, { "epoch": 0.171875, "grad_norm": 2.96875, "grad_norm_var": 0.014058430989583334, "learning_rate": 0.0001, "loss": 8.0601, "loss/crossentropy": 2.3810765743255615, "loss/hidden": 3.2421875, "loss/jsd": 0.0, "loss/logits": 0.24706510454416275, "step": 2750 }, { "epoch": 0.172, "grad_norm": 2.921875, "grad_norm_var": 0.015360514322916666, "learning_rate": 0.0001, "loss": 8.0355, "loss/crossentropy": 2.258496880531311, "loss/hidden": 3.1875, "loss/jsd": 0.0, "loss/logits": 0.26913100481033325, "step": 2752 }, { "epoch": 0.172125, "grad_norm": 3.296875, "grad_norm_var": 0.0178131103515625, "learning_rate": 0.0001, "loss": 8.2945, "loss/crossentropy": 2.477080225944519, "loss/hidden": 3.2578125, "loss/jsd": 0.0, "loss/logits": 0.26651376485824585, "step": 2754 }, { "epoch": 0.17225, "grad_norm": 3.09375, "grad_norm_var": 0.0175933837890625, "learning_rate": 0.0001, "loss": 8.1, "loss/crossentropy": 2.3166109323501587, "loss/hidden": 3.1875, "loss/jsd": 0.0, "loss/logits": 0.24783167243003845, "step": 2756 }, { "epoch": 0.172375, "grad_norm": 2.875, "grad_norm_var": 0.014012654622395834, "learning_rate": 0.0001, "loss": 8.1753, "loss/crossentropy": 2.3947906494140625, "loss/hidden": 3.1953125, "loss/jsd": 0.0, "loss/logits": 0.2784908413887024, "step": 2758 }, { "epoch": 0.1725, "grad_norm": 3.171875, "grad_norm_var": 0.0149078369140625, "learning_rate": 0.0001, "loss": 8.298, "loss/crossentropy": 2.308673143386841, "loss/hidden": 3.1875, "loss/jsd": 0.0, "loss/logits": 0.2578892260789871, "step": 2760 }, { "epoch": 0.172625, "grad_norm": 3.140625, "grad_norm_var": 0.015816243489583333, "learning_rate": 0.0001, "loss": 8.277, "loss/crossentropy": 2.410847306251526, "loss/hidden": 3.171875, "loss/jsd": 0.0, "loss/logits": 0.2580881118774414, "step": 2762 }, { "epoch": 0.17275, "grad_norm": 2.90625, "grad_norm_var": 0.0188385009765625, "learning_rate": 0.0001, "loss": 8.3098, "loss/crossentropy": 2.4470525979995728, "loss/hidden": 3.25, "loss/jsd": 0.0, "loss/logits": 0.25711022317409515, "step": 2764 }, { "epoch": 0.172875, "grad_norm": 3.125, "grad_norm_var": 0.018603515625, "learning_rate": 0.0001, "loss": 8.384, "loss/crossentropy": 2.5611066818237305, "loss/hidden": 3.328125, "loss/jsd": 0.0, "loss/logits": 0.2998035103082657, "step": 2766 }, { "epoch": 0.173, "grad_norm": 2.984375, "grad_norm_var": 0.016258748372395833, "learning_rate": 0.0001, "loss": 8.3617, "loss/crossentropy": 2.5504335165023804, "loss/hidden": 3.1875, "loss/jsd": 0.0, "loss/logits": 0.27010831236839294, "step": 2768 }, { "epoch": 0.173125, "grad_norm": 2.703125, "grad_norm_var": 0.020897420247395833, "learning_rate": 0.0001, "loss": 8.0711, "loss/crossentropy": 2.2784290313720703, "loss/hidden": 3.2578125, "loss/jsd": 0.0, "loss/logits": 0.25674042105674744, "step": 2770 }, { "epoch": 0.17325, "grad_norm": 3.25, "grad_norm_var": 0.022782389322916666, "learning_rate": 0.0001, "loss": 8.2164, "loss/crossentropy": 2.421698570251465, "loss/hidden": 3.21875, "loss/jsd": 0.0, "loss/logits": 0.2708132416009903, "step": 2772 }, { "epoch": 0.173375, "grad_norm": 3.703125, "grad_norm_var": 0.05318603515625, "learning_rate": 0.0001, "loss": 8.2761, "loss/crossentropy": 2.1522774696350098, "loss/hidden": 3.1796875, "loss/jsd": 0.0, "loss/logits": 0.24617627263069153, "step": 2774 }, { "epoch": 0.1735, "grad_norm": 3.09375, "grad_norm_var": 0.0539459228515625, "learning_rate": 0.0001, "loss": 8.3435, "loss/crossentropy": 2.5082781314849854, "loss/hidden": 3.25, "loss/jsd": 0.0, "loss/logits": 0.27957382798194885, "step": 2776 }, { "epoch": 0.173625, "grad_norm": 3.21875, "grad_norm_var": 0.05487874348958333, "learning_rate": 0.0001, "loss": 8.0423, "loss/crossentropy": 2.2934749126434326, "loss/hidden": 3.1953125, "loss/jsd": 0.0, "loss/logits": 0.2753216177225113, "step": 2778 }, { "epoch": 0.17375, "grad_norm": 2.984375, "grad_norm_var": 0.05186258951822917, "learning_rate": 0.0001, "loss": 8.3362, "loss/crossentropy": 2.4466487169265747, "loss/hidden": 3.1875, "loss/jsd": 0.0, "loss/logits": 0.2686302661895752, "step": 2780 }, { "epoch": 0.173875, "grad_norm": 3.0, "grad_norm_var": 0.05314839680989583, "learning_rate": 0.0001, "loss": 8.2283, "loss/crossentropy": 2.2464054822921753, "loss/hidden": 3.2109375, "loss/jsd": 0.0, "loss/logits": 0.27123279869556427, "step": 2782 }, { "epoch": 0.174, "grad_norm": 3.046875, "grad_norm_var": 0.0550933837890625, "learning_rate": 0.0001, "loss": 8.2158, "loss/crossentropy": 2.3913623094558716, "loss/hidden": 3.21875, "loss/jsd": 0.0, "loss/logits": 0.27313700318336487, "step": 2784 }, { "epoch": 0.174125, "grad_norm": 3.0625, "grad_norm_var": 0.0407623291015625, "learning_rate": 0.0001, "loss": 8.2836, "loss/crossentropy": 2.512578248977661, "loss/hidden": 3.1875, "loss/jsd": 0.0, "loss/logits": 0.2598787695169449, "step": 2786 }, { "epoch": 0.17425, "grad_norm": 2.984375, "grad_norm_var": 0.04078369140625, "learning_rate": 0.0001, "loss": 8.0561, "loss/crossentropy": 2.5069663524627686, "loss/hidden": 3.1796875, "loss/jsd": 0.0, "loss/logits": 0.273094579577446, "step": 2788 }, { "epoch": 0.174375, "grad_norm": 3.15625, "grad_norm_var": 0.012300618489583333, "learning_rate": 0.0001, "loss": 8.1484, "loss/crossentropy": 2.0232608318328857, "loss/hidden": 3.15625, "loss/jsd": 0.0, "loss/logits": 0.2657552883028984, "step": 2790 }, { "epoch": 0.1745, "grad_norm": 2.96875, "grad_norm_var": 0.017220052083333333, "learning_rate": 0.0001, "loss": 8.4669, "loss/crossentropy": 2.387427568435669, "loss/hidden": 3.21875, "loss/jsd": 0.0, "loss/logits": 0.2625594586133957, "step": 2792 }, { "epoch": 0.174625, "grad_norm": 3.03125, "grad_norm_var": 0.016405232747395835, "learning_rate": 0.0001, "loss": 8.3922, "loss/crossentropy": 2.3673804998397827, "loss/hidden": 3.234375, "loss/jsd": 0.0, "loss/logits": 0.28117281198501587, "step": 2794 }, { "epoch": 0.17475, "grad_norm": 3.265625, "grad_norm_var": 0.03388264973958333, "learning_rate": 0.0001, "loss": 8.2787, "loss/crossentropy": 2.4962185621261597, "loss/hidden": 3.265625, "loss/jsd": 0.0, "loss/logits": 0.286916047334671, "step": 2796 }, { "epoch": 0.174875, "grad_norm": 3.40625, "grad_norm_var": 0.03957926432291667, "learning_rate": 0.0001, "loss": 8.2466, "loss/crossentropy": 2.418305993080139, "loss/hidden": 3.25, "loss/jsd": 0.0, "loss/logits": 0.2539825364947319, "step": 2798 }, { "epoch": 0.175, "grad_norm": 2.984375, "grad_norm_var": 0.038304646809895836, "learning_rate": 0.0001, "loss": 8.1027, "loss/crossentropy": 2.1551238298416138, "loss/hidden": 3.203125, "loss/jsd": 0.0, "loss/logits": 0.2831410765647888, "step": 2800 }, { "epoch": 0.175125, "grad_norm": 3.125, "grad_norm_var": 0.03675130208333333, "learning_rate": 0.0001, "loss": 8.3028, "loss/crossentropy": 2.063372015953064, "loss/hidden": 3.1953125, "loss/jsd": 0.0, "loss/logits": 0.2392185851931572, "step": 2802 }, { "epoch": 0.17525, "grad_norm": 3.25, "grad_norm_var": 0.03528645833333333, "learning_rate": 0.0001, "loss": 8.3566, "loss/crossentropy": 2.4662251472473145, "loss/hidden": 3.2265625, "loss/jsd": 0.0, "loss/logits": 0.2643844783306122, "step": 2804 }, { "epoch": 0.175375, "grad_norm": 3.03125, "grad_norm_var": 0.03893229166666667, "learning_rate": 0.0001, "loss": 8.2343, "loss/crossentropy": 2.275088667869568, "loss/hidden": 3.171875, "loss/jsd": 0.0, "loss/logits": 0.25276893377304077, "step": 2806 }, { "epoch": 0.1755, "grad_norm": 3.15625, "grad_norm_var": 0.030692545572916667, "learning_rate": 0.0001, "loss": 8.2212, "loss/crossentropy": 2.20920592546463, "loss/hidden": 3.1796875, "loss/jsd": 0.0, "loss/logits": 0.25742238759994507, "step": 2808 }, { "epoch": 0.175625, "grad_norm": 3.125, "grad_norm_var": 0.028450520833333333, "learning_rate": 0.0001, "loss": 8.4952, "loss/crossentropy": 2.378191828727722, "loss/hidden": 3.3046875, "loss/jsd": 0.0, "loss/logits": 0.3033263385295868, "step": 2810 }, { "epoch": 0.17575, "grad_norm": 3.09375, "grad_norm_var": 0.0159332275390625, "learning_rate": 0.0001, "loss": 8.3287, "loss/crossentropy": 2.3606228828430176, "loss/hidden": 3.2421875, "loss/jsd": 0.0, "loss/logits": 0.26286470890045166, "step": 2812 }, { "epoch": 0.175875, "grad_norm": 3.171875, "grad_norm_var": 0.0115631103515625, "learning_rate": 0.0001, "loss": 8.0903, "loss/crossentropy": 2.5904735326766968, "loss/hidden": 3.25, "loss/jsd": 0.0, "loss/logits": 0.28542497754096985, "step": 2814 }, { "epoch": 0.176, "grad_norm": 3.015625, "grad_norm_var": 0.010970052083333333, "learning_rate": 0.0001, "loss": 8.2841, "loss/crossentropy": 2.1312711238861084, "loss/hidden": 3.15625, "loss/jsd": 0.0, "loss/logits": 0.2546592205762863, "step": 2816 }, { "epoch": 0.176125, "grad_norm": 3.203125, "grad_norm_var": 0.011747233072916667, "learning_rate": 0.0001, "loss": 8.3833, "loss/crossentropy": 2.4488954544067383, "loss/hidden": 3.28125, "loss/jsd": 0.0, "loss/logits": 0.27959881722927094, "step": 2818 }, { "epoch": 0.17625, "grad_norm": 3.125, "grad_norm_var": 0.010041300455729167, "learning_rate": 0.0001, "loss": 8.364, "loss/crossentropy": 2.4785863161087036, "loss/hidden": 3.21875, "loss/jsd": 0.0, "loss/logits": 0.26612453162670135, "step": 2820 }, { "epoch": 0.176375, "grad_norm": 3.171875, "grad_norm_var": 0.008503214518229166, "learning_rate": 0.0001, "loss": 8.1876, "loss/crossentropy": 2.344822645187378, "loss/hidden": 3.1875, "loss/jsd": 0.0, "loss/logits": 0.24801667034626007, "step": 2822 }, { "epoch": 0.1765, "grad_norm": 2.984375, "grad_norm_var": 0.009000651041666667, "learning_rate": 0.0001, "loss": 8.2002, "loss/crossentropy": 2.2322527170181274, "loss/hidden": 3.171875, "loss/jsd": 0.0, "loss/logits": 0.23924288898706436, "step": 2824 }, { "epoch": 0.176625, "grad_norm": 2.84375, "grad_norm_var": 0.011747233072916667, "learning_rate": 0.0001, "loss": 8.2038, "loss/crossentropy": 2.2701025009155273, "loss/hidden": 3.1484375, "loss/jsd": 0.0, "loss/logits": 0.24792873859405518, "step": 2826 }, { "epoch": 0.17675, "grad_norm": 2.8125, "grad_norm_var": 0.017313639322916668, "learning_rate": 0.0001, "loss": 8.0395, "loss/crossentropy": 2.2964216470718384, "loss/hidden": 3.1640625, "loss/jsd": 0.0, "loss/logits": 0.24705877900123596, "step": 2828 }, { "epoch": 0.176875, "grad_norm": 2.953125, "grad_norm_var": 0.017378743489583334, "learning_rate": 0.0001, "loss": 8.4679, "loss/crossentropy": 2.418280839920044, "loss/hidden": 3.203125, "loss/jsd": 0.0, "loss/logits": 0.3100636601448059, "step": 2830 }, { "epoch": 0.177, "grad_norm": 3.375, "grad_norm_var": 0.026488240559895834, "learning_rate": 0.0001, "loss": 8.1278, "loss/crossentropy": 2.4353508949279785, "loss/hidden": 3.328125, "loss/jsd": 0.0, "loss/logits": 0.2712879627943039, "step": 2832 }, { "epoch": 0.177125, "grad_norm": 2.96875, "grad_norm_var": 0.023363240559895835, "learning_rate": 0.0001, "loss": 8.4328, "loss/crossentropy": 2.277379631996155, "loss/hidden": 3.1875, "loss/jsd": 0.0, "loss/logits": 0.2677570581436157, "step": 2834 }, { "epoch": 0.17725, "grad_norm": 3.078125, "grad_norm_var": 0.024348958333333334, "learning_rate": 0.0001, "loss": 7.8742, "loss/crossentropy": 2.3050626516342163, "loss/hidden": 3.203125, "loss/jsd": 0.0, "loss/logits": 0.25615155696868896, "step": 2836 }, { "epoch": 0.177375, "grad_norm": 2.671875, "grad_norm_var": 0.031102498372395832, "learning_rate": 0.0001, "loss": 8.1044, "loss/crossentropy": 2.2054827213287354, "loss/hidden": 3.171875, "loss/jsd": 0.0, "loss/logits": 0.26621413230895996, "step": 2838 }, { "epoch": 0.1775, "grad_norm": 3.40625, "grad_norm_var": 0.04169820149739583, "learning_rate": 0.0001, "loss": 8.0213, "loss/crossentropy": 2.344091534614563, "loss/hidden": 3.21875, "loss/jsd": 0.0, "loss/logits": 0.26311126351356506, "step": 2840 }, { "epoch": 0.177625, "grad_norm": 2.9375, "grad_norm_var": 0.039525349934895836, "learning_rate": 0.0001, "loss": 8.2404, "loss/crossentropy": 2.3576395511627197, "loss/hidden": 3.28125, "loss/jsd": 0.0, "loss/logits": 0.26957815885543823, "step": 2842 }, { "epoch": 0.17775, "grad_norm": 3.03125, "grad_norm_var": 0.03683268229166667, "learning_rate": 0.0001, "loss": 8.1148, "loss/crossentropy": 2.0740586519241333, "loss/hidden": 3.1484375, "loss/jsd": 0.0, "loss/logits": 0.2444992959499359, "step": 2844 }, { "epoch": 0.177875, "grad_norm": 2.875, "grad_norm_var": 0.03797098795572917, "learning_rate": 0.0001, "loss": 8.0264, "loss/crossentropy": 2.196990489959717, "loss/hidden": 3.1953125, "loss/jsd": 0.0, "loss/logits": 0.2584913969039917, "step": 2846 }, { "epoch": 0.178, "grad_norm": 3.078125, "grad_norm_var": 0.029931640625, "learning_rate": 0.0001, "loss": 8.0912, "loss/crossentropy": 2.465600848197937, "loss/hidden": 3.265625, "loss/jsd": 0.0, "loss/logits": 0.31103046238422394, "step": 2848 }, { "epoch": 0.178125, "grad_norm": 3.390625, "grad_norm_var": 0.03961588541666667, "learning_rate": 0.0001, "loss": 8.6973, "loss/crossentropy": 2.542473077774048, "loss/hidden": 3.25, "loss/jsd": 0.0, "loss/logits": 0.2756101042032242, "step": 2850 }, { "epoch": 0.17825, "grad_norm": 3.046875, "grad_norm_var": 0.038248697916666664, "learning_rate": 0.0001, "loss": 8.0748, "loss/crossentropy": 2.357996344566345, "loss/hidden": 3.1484375, "loss/jsd": 0.0, "loss/logits": 0.2627618610858917, "step": 2852 }, { "epoch": 0.178375, "grad_norm": 3.125, "grad_norm_var": 0.0279449462890625, "learning_rate": 0.0001, "loss": 8.2433, "loss/crossentropy": 2.387884736061096, "loss/hidden": 3.1484375, "loss/jsd": 0.0, "loss/logits": 0.2805600166320801, "step": 2854 }, { "epoch": 0.1785, "grad_norm": 3.21875, "grad_norm_var": 0.02301025390625, "learning_rate": 0.0001, "loss": 8.2198, "loss/crossentropy": 2.2316168546676636, "loss/hidden": 3.25, "loss/jsd": 0.0, "loss/logits": 0.2630866765975952, "step": 2856 }, { "epoch": 0.178625, "grad_norm": 3.171875, "grad_norm_var": 0.024898274739583334, "learning_rate": 0.0001, "loss": 8.1996, "loss/crossentropy": 2.388888955116272, "loss/hidden": 3.1796875, "loss/jsd": 0.0, "loss/logits": 0.2638123258948326, "step": 2858 }, { "epoch": 0.17875, "grad_norm": 2.890625, "grad_norm_var": 0.025519816080729167, "learning_rate": 0.0001, "loss": 8.2193, "loss/crossentropy": 2.575498104095459, "loss/hidden": 3.1953125, "loss/jsd": 0.0, "loss/logits": 0.2572275549173355, "step": 2860 }, { "epoch": 0.178875, "grad_norm": 3.171875, "grad_norm_var": 0.024837239583333334, "learning_rate": 0.0001, "loss": 8.3578, "loss/crossentropy": 2.524762511253357, "loss/hidden": 3.2578125, "loss/jsd": 0.0, "loss/logits": 0.3061821162700653, "step": 2862 }, { "epoch": 0.179, "grad_norm": 3.125, "grad_norm_var": 0.025716145833333332, "learning_rate": 0.0001, "loss": 8.2261, "loss/crossentropy": 2.2874268293380737, "loss/hidden": 3.1796875, "loss/jsd": 0.0, "loss/logits": 0.26778069138526917, "step": 2864 }, { "epoch": 0.179125, "grad_norm": 3.203125, "grad_norm_var": 0.022591145833333333, "learning_rate": 0.0001, "loss": 8.3879, "loss/crossentropy": 2.28354811668396, "loss/hidden": 3.1796875, "loss/jsd": 0.0, "loss/logits": 0.26096734404563904, "step": 2866 }, { "epoch": 0.17925, "grad_norm": 3.15625, "grad_norm_var": 0.025211588541666666, "learning_rate": 0.0001, "loss": 8.2155, "loss/crossentropy": 2.1607202291488647, "loss/hidden": 3.2109375, "loss/jsd": 0.0, "loss/logits": 0.26596730947494507, "step": 2868 }, { "epoch": 0.179375, "grad_norm": 3.53125, "grad_norm_var": 0.0307769775390625, "learning_rate": 0.0001, "loss": 8.2848, "loss/crossentropy": 2.4421184062957764, "loss/hidden": 3.171875, "loss/jsd": 0.0, "loss/logits": 0.25268222391605377, "step": 2870 }, { "epoch": 0.1795, "grad_norm": 3.078125, "grad_norm_var": 0.03218994140625, "learning_rate": 0.0001, "loss": 8.184, "loss/crossentropy": 2.24539315700531, "loss/hidden": 3.140625, "loss/jsd": 0.0, "loss/logits": 0.2509969249367714, "step": 2872 }, { "epoch": 0.179625, "grad_norm": 2.9375, "grad_norm_var": 0.031126912434895834, "learning_rate": 0.0001, "loss": 8.0381, "loss/crossentropy": 2.1964842081069946, "loss/hidden": 3.2890625, "loss/jsd": 0.0, "loss/logits": 0.261972077190876, "step": 2874 }, { "epoch": 0.17975, "grad_norm": 3.140625, "grad_norm_var": 0.026756795247395833, "learning_rate": 0.0001, "loss": 8.2261, "loss/crossentropy": 2.2588162422180176, "loss/hidden": 3.2421875, "loss/jsd": 0.0, "loss/logits": 0.28808237612247467, "step": 2876 }, { "epoch": 0.179875, "grad_norm": 2.890625, "grad_norm_var": 0.03379618326822917, "learning_rate": 0.0001, "loss": 8.163, "loss/crossentropy": 2.177803933620453, "loss/hidden": 3.2734375, "loss/jsd": 0.0, "loss/logits": 0.27027180790901184, "step": 2878 }, { "epoch": 0.18, "grad_norm": 3.109375, "grad_norm_var": 0.030500284830729165, "learning_rate": 0.0001, "loss": 8.236, "loss/crossentropy": 2.0715816020965576, "loss/hidden": 3.171875, "loss/jsd": 0.0, "loss/logits": 0.25250908732414246, "step": 2880 }, { "epoch": 0.180125, "grad_norm": 2.953125, "grad_norm_var": 0.031086222330729166, "learning_rate": 0.0001, "loss": 8.1597, "loss/crossentropy": 2.3349109888076782, "loss/hidden": 3.1875, "loss/jsd": 0.0, "loss/logits": 0.2642297148704529, "step": 2882 }, { "epoch": 0.18025, "grad_norm": 3.21875, "grad_norm_var": 0.02681884765625, "learning_rate": 0.0001, "loss": 8.0044, "loss/crossentropy": 2.1576240062713623, "loss/hidden": 3.2109375, "loss/jsd": 0.0, "loss/logits": 0.25084151327610016, "step": 2884 }, { "epoch": 0.180375, "grad_norm": 2.921875, "grad_norm_var": 0.01011962890625, "learning_rate": 0.0001, "loss": 8.0148, "loss/crossentropy": 2.1341161131858826, "loss/hidden": 3.1953125, "loss/jsd": 0.0, "loss/logits": 0.23193783313035965, "step": 2886 }, { "epoch": 0.1805, "grad_norm": 3.0, "grad_norm_var": 0.010856119791666667, "learning_rate": 0.0001, "loss": 8.1343, "loss/crossentropy": 2.2248082160949707, "loss/hidden": 3.1328125, "loss/jsd": 0.0, "loss/logits": 0.26918257772922516, "step": 2888 }, { "epoch": 0.180625, "grad_norm": 3.03125, "grad_norm_var": 0.010334269205729166, "learning_rate": 0.0001, "loss": 8.187, "loss/crossentropy": 2.5334991216659546, "loss/hidden": 3.2109375, "loss/jsd": 0.0, "loss/logits": 0.29171979427337646, "step": 2890 }, { "epoch": 0.18075, "grad_norm": 3.109375, "grad_norm_var": 0.010367838541666667, "learning_rate": 0.0001, "loss": 8.2264, "loss/crossentropy": 2.3791427612304688, "loss/hidden": 3.2109375, "loss/jsd": 0.0, "loss/logits": 0.27077220380306244, "step": 2892 }, { "epoch": 0.180875, "grad_norm": 3.21875, "grad_norm_var": 0.0108306884765625, "learning_rate": 0.0001, "loss": 8.0115, "loss/crossentropy": 2.189573884010315, "loss/hidden": 3.140625, "loss/jsd": 0.0, "loss/logits": 0.268039345741272, "step": 2894 }, { "epoch": 0.181, "grad_norm": 3.015625, "grad_norm_var": 0.01060791015625, "learning_rate": 0.0001, "loss": 8.3331, "loss/crossentropy": 2.399560272693634, "loss/hidden": 3.1953125, "loss/jsd": 0.0, "loss/logits": 0.24194006621837616, "step": 2896 }, { "epoch": 0.181125, "grad_norm": 3.03125, "grad_norm_var": 0.010184733072916667, "learning_rate": 0.0001, "loss": 8.1515, "loss/crossentropy": 2.140601396560669, "loss/hidden": 3.2265625, "loss/jsd": 0.0, "loss/logits": 0.24887196719646454, "step": 2898 }, { "epoch": 0.18125, "grad_norm": 2.75, "grad_norm_var": 0.014872233072916666, "learning_rate": 0.0001, "loss": 7.7282, "loss/crossentropy": 2.0853304862976074, "loss/hidden": 3.1640625, "loss/jsd": 0.0, "loss/logits": 0.2543262392282486, "step": 2900 }, { "epoch": 0.181375, "grad_norm": 3.75, "grad_norm_var": 1.08092041015625, "learning_rate": 0.0001, "loss": 8.8119, "loss/crossentropy": 2.390069603919983, "loss/hidden": 3.2890625, "loss/jsd": 0.0, "loss/logits": 0.3597041964530945, "step": 2902 }, { "epoch": 0.1815, "grad_norm": 3.421875, "grad_norm_var": 1.06201171875, "learning_rate": 0.0001, "loss": 8.1467, "loss/crossentropy": 2.2973451614379883, "loss/hidden": 3.15625, "loss/jsd": 0.0, "loss/logits": 0.30557607114315033, "step": 2904 }, { "epoch": 0.181625, "grad_norm": 3.171875, "grad_norm_var": 1.0474680582682292, "learning_rate": 0.0001, "loss": 8.1066, "loss/crossentropy": 2.217726707458496, "loss/hidden": 3.2734375, "loss/jsd": 0.0, "loss/logits": 0.2568957358598709, "step": 2906 }, { "epoch": 0.18175, "grad_norm": 3.484375, "grad_norm_var": 1.03580322265625, "learning_rate": 0.0001, "loss": 8.2859, "loss/crossentropy": 2.403484344482422, "loss/hidden": 3.1875, "loss/jsd": 0.0, "loss/logits": 0.2820377051830292, "step": 2908 }, { "epoch": 0.181875, "grad_norm": 2.96875, "grad_norm_var": 1.0566721598307292, "learning_rate": 0.0001, "loss": 8.1078, "loss/crossentropy": 2.49627947807312, "loss/hidden": 3.21875, "loss/jsd": 0.0, "loss/logits": 0.2758508771657944, "step": 2910 }, { "epoch": 0.182, "grad_norm": 2.921875, "grad_norm_var": 1.066657511393229, "learning_rate": 0.0001, "loss": 8.08, "loss/crossentropy": 2.4209847450256348, "loss/hidden": 3.25, "loss/jsd": 0.0, "loss/logits": 0.276279091835022, "step": 2912 }, { "epoch": 0.182125, "grad_norm": 3.03125, "grad_norm_var": 1.0872233072916666, "learning_rate": 0.0001, "loss": 8.0057, "loss/crossentropy": 2.4141104221343994, "loss/hidden": 3.1328125, "loss/jsd": 0.0, "loss/logits": 0.25281232595443726, "step": 2914 }, { "epoch": 0.18225, "grad_norm": 3.71875, "grad_norm_var": 1.044189453125, "learning_rate": 0.0001, "loss": 8.4177, "loss/crossentropy": 2.1968607902526855, "loss/hidden": 3.2265625, "loss/jsd": 0.0, "loss/logits": 0.2601237893104553, "step": 2916 }, { "epoch": 0.182375, "grad_norm": 4.09375, "grad_norm_var": 0.12899983723958333, "learning_rate": 0.0001, "loss": 8.6519, "loss/crossentropy": 2.355108380317688, "loss/hidden": 3.203125, "loss/jsd": 0.0, "loss/logits": 0.2877423167228699, "step": 2918 }, { "epoch": 0.1825, "grad_norm": 2.984375, "grad_norm_var": 0.13144429524739584, "learning_rate": 0.0001, "loss": 7.8936, "loss/crossentropy": 2.0170373916625977, "loss/hidden": 3.1640625, "loss/jsd": 0.0, "loss/logits": 0.23654960840940475, "step": 2920 }, { "epoch": 0.182625, "grad_norm": 3.1875, "grad_norm_var": 0.13209228515625, "learning_rate": 0.0001, "loss": 8.0005, "loss/crossentropy": 2.3714324235916138, "loss/hidden": 3.125, "loss/jsd": 0.0, "loss/logits": 0.24751296639442444, "step": 2922 }, { "epoch": 0.18275, "grad_norm": 2.9375, "grad_norm_var": 0.12609049479166667, "learning_rate": 0.0001, "loss": 8.1466, "loss/crossentropy": 2.2555553913116455, "loss/hidden": 3.1484375, "loss/jsd": 0.0, "loss/logits": 0.24768846482038498, "step": 2924 }, { "epoch": 0.182875, "grad_norm": 3.09375, "grad_norm_var": 0.12849019368489584, "learning_rate": 0.0001, "loss": 8.0836, "loss/crossentropy": 1.9109330773353577, "loss/hidden": 3.2421875, "loss/jsd": 0.0, "loss/logits": 0.25811461359262466, "step": 2926 }, { "epoch": 0.183, "grad_norm": 3.09375, "grad_norm_var": 0.13059488932291666, "learning_rate": 0.0001, "loss": 8.2019, "loss/crossentropy": 2.4296613931655884, "loss/hidden": 3.15625, "loss/jsd": 0.0, "loss/logits": 0.2532989978790283, "step": 2928 }, { "epoch": 0.183125, "grad_norm": 3.109375, "grad_norm_var": 0.12439778645833334, "learning_rate": 0.0001, "loss": 8.5959, "loss/crossentropy": 2.5520033836364746, "loss/hidden": 3.203125, "loss/jsd": 0.0, "loss/logits": 0.2584295719861984, "step": 2930 }, { "epoch": 0.18325, "grad_norm": 3.234375, "grad_norm_var": 0.10261128743489584, "learning_rate": 0.0001, "loss": 8.167, "loss/crossentropy": 2.2617307901382446, "loss/hidden": 3.234375, "loss/jsd": 0.0, "loss/logits": 0.25226905941963196, "step": 2932 }, { "epoch": 0.183375, "grad_norm": 3.640625, "grad_norm_var": 0.05717671712239583, "learning_rate": 0.0001, "loss": 8.476, "loss/crossentropy": 2.2038698196411133, "loss/hidden": 3.2421875, "loss/jsd": 0.0, "loss/logits": 0.27726230025291443, "step": 2934 }, { "epoch": 0.1835, "grad_norm": 3.40625, "grad_norm_var": 0.05496419270833333, "learning_rate": 0.0001, "loss": 8.2697, "loss/crossentropy": 2.011132597923279, "loss/hidden": 3.15625, "loss/jsd": 0.0, "loss/logits": 0.24731668829917908, "step": 2936 }, { "epoch": 0.183625, "grad_norm": 3.015625, "grad_norm_var": 0.05384114583333333, "learning_rate": 0.0001, "loss": 8.2291, "loss/crossentropy": 2.241647481918335, "loss/hidden": 3.15625, "loss/jsd": 0.0, "loss/logits": 0.24874335527420044, "step": 2938 }, { "epoch": 0.18375, "grad_norm": 3.640625, "grad_norm_var": 0.0597076416015625, "learning_rate": 0.0001, "loss": 8.1807, "loss/crossentropy": 2.0755521059036255, "loss/hidden": 3.1796875, "loss/jsd": 0.0, "loss/logits": 0.26138997077941895, "step": 2940 }, { "epoch": 0.183875, "grad_norm": 3.015625, "grad_norm_var": 0.05976155598958333, "learning_rate": 0.0001, "loss": 8.4186, "loss/crossentropy": 2.4679603576660156, "loss/hidden": 3.2578125, "loss/jsd": 0.0, "loss/logits": 0.27902379631996155, "step": 2942 }, { "epoch": 0.184, "grad_norm": 3.0, "grad_norm_var": 0.05484619140625, "learning_rate": 0.0001, "loss": 8.1271, "loss/crossentropy": 2.0534247159957886, "loss/hidden": 3.1796875, "loss/jsd": 0.0, "loss/logits": 0.26021403074264526, "step": 2944 }, { "epoch": 0.184125, "grad_norm": 3.015625, "grad_norm_var": 0.05119527180989583, "learning_rate": 0.0001, "loss": 8.2078, "loss/crossentropy": 2.2699111700057983, "loss/hidden": 3.1640625, "loss/jsd": 0.0, "loss/logits": 0.26569822430610657, "step": 2946 }, { "epoch": 0.18425, "grad_norm": 2.828125, "grad_norm_var": 0.06213785807291667, "learning_rate": 0.0001, "loss": 8.0974, "loss/crossentropy": 2.420296549797058, "loss/hidden": 3.1640625, "loss/jsd": 0.0, "loss/logits": 0.2546275109052658, "step": 2948 }, { "epoch": 0.184375, "grad_norm": 3.34375, "grad_norm_var": 0.04946187337239583, "learning_rate": 0.0001, "loss": 8.4089, "loss/crossentropy": 2.5957722663879395, "loss/hidden": 3.1796875, "loss/jsd": 0.0, "loss/logits": 0.27632173895835876, "step": 2950 }, { "epoch": 0.1845, "grad_norm": 3.53125, "grad_norm_var": 0.05413004557291667, "learning_rate": 0.0001, "loss": 8.3382, "loss/crossentropy": 2.484778642654419, "loss/hidden": 3.234375, "loss/jsd": 0.0, "loss/logits": 0.27322784066200256, "step": 2952 }, { "epoch": 0.184625, "grad_norm": 3.078125, "grad_norm_var": 0.056473795572916666, "learning_rate": 0.0001, "loss": 8.0627, "loss/crossentropy": 2.148501753807068, "loss/hidden": 3.1875, "loss/jsd": 0.0, "loss/logits": 0.2559093087911606, "step": 2954 }, { "epoch": 0.18475, "grad_norm": 3.046875, "grad_norm_var": 0.040771484375, "learning_rate": 0.0001, "loss": 8.3083, "loss/crossentropy": 2.314954161643982, "loss/hidden": 3.171875, "loss/jsd": 0.0, "loss/logits": 0.28318941593170166, "step": 2956 }, { "epoch": 0.184875, "grad_norm": 3.015625, "grad_norm_var": 0.03795166015625, "learning_rate": 0.0001, "loss": 8.1172, "loss/crossentropy": 2.366102695465088, "loss/hidden": 3.265625, "loss/jsd": 0.0, "loss/logits": 0.2760534882545471, "step": 2958 }, { "epoch": 0.185, "grad_norm": 3.171875, "grad_norm_var": 0.03756510416666667, "learning_rate": 0.0001, "loss": 8.2888, "loss/crossentropy": 2.487810730934143, "loss/hidden": 3.2578125, "loss/jsd": 0.0, "loss/logits": 0.2672920525074005, "step": 2960 }, { "epoch": 0.185125, "grad_norm": 3.046875, "grad_norm_var": 0.036554972330729164, "learning_rate": 0.0001, "loss": 8.1165, "loss/crossentropy": 2.235316514968872, "loss/hidden": 3.1328125, "loss/jsd": 0.0, "loss/logits": 0.25732211768627167, "step": 2962 }, { "epoch": 0.18525, "grad_norm": 2.890625, "grad_norm_var": 0.033600870768229166, "learning_rate": 0.0001, "loss": 8.2153, "loss/crossentropy": 2.688939690589905, "loss/hidden": 3.3125, "loss/jsd": 0.0, "loss/logits": 0.31740468740463257, "step": 2964 }, { "epoch": 0.185375, "grad_norm": 2.921875, "grad_norm_var": 0.030248006184895832, "learning_rate": 0.0001, "loss": 8.0094, "loss/crossentropy": 2.5690836906433105, "loss/hidden": 3.203125, "loss/jsd": 0.0, "loss/logits": 0.2810027152299881, "step": 2966 }, { "epoch": 0.1855, "grad_norm": 2.953125, "grad_norm_var": 0.013817342122395833, "learning_rate": 0.0001, "loss": 7.8082, "loss/crossentropy": 2.1851229667663574, "loss/hidden": 3.1328125, "loss/jsd": 0.0, "loss/logits": 0.2638121098279953, "step": 2968 }, { "epoch": 0.185625, "grad_norm": 3.71875, "grad_norm_var": 0.044266764322916666, "learning_rate": 0.0001, "loss": 8.3646, "loss/crossentropy": 2.1047242879867554, "loss/hidden": 3.421875, "loss/jsd": 0.0, "loss/logits": 0.3048202395439148, "step": 2970 }, { "epoch": 0.18575, "grad_norm": 2.984375, "grad_norm_var": 0.0408843994140625, "learning_rate": 0.0001, "loss": 8.2758, "loss/crossentropy": 2.1301698684692383, "loss/hidden": 3.1796875, "loss/jsd": 0.0, "loss/logits": 0.24628406763076782, "step": 2972 }, { "epoch": 0.185875, "grad_norm": 2.890625, "grad_norm_var": 0.038361612955729166, "learning_rate": 0.0001, "loss": 8.2939, "loss/crossentropy": 2.433535099029541, "loss/hidden": 3.234375, "loss/jsd": 0.0, "loss/logits": 0.29371021687984467, "step": 2974 }, { "epoch": 0.186, "grad_norm": 3.0, "grad_norm_var": 0.0372955322265625, "learning_rate": 0.0001, "loss": 8.1541, "loss/crossentropy": 2.1997573375701904, "loss/hidden": 3.1953125, "loss/jsd": 0.0, "loss/logits": 0.2511989399790764, "step": 2976 }, { "epoch": 0.186125, "grad_norm": 2.9375, "grad_norm_var": 0.03798726399739583, "learning_rate": 0.0001, "loss": 8.4344, "loss/crossentropy": 2.4531192779541016, "loss/hidden": 3.234375, "loss/jsd": 0.0, "loss/logits": 0.2771553546190262, "step": 2978 }, { "epoch": 0.18625, "grad_norm": 3.15625, "grad_norm_var": 0.03854166666666667, "learning_rate": 0.0001, "loss": 8.0181, "loss/crossentropy": 2.2669776678085327, "loss/hidden": 3.1484375, "loss/jsd": 0.0, "loss/logits": 0.25914834439754486, "step": 2980 }, { "epoch": 0.186375, "grad_norm": 2.890625, "grad_norm_var": 0.048563639322916664, "learning_rate": 0.0001, "loss": 8.2711, "loss/crossentropy": 2.322340726852417, "loss/hidden": 3.328125, "loss/jsd": 0.0, "loss/logits": 0.28330640494823456, "step": 2982 }, { "epoch": 0.1865, "grad_norm": 2.90625, "grad_norm_var": 0.049540201822916664, "learning_rate": 0.0001, "loss": 8.1752, "loss/crossentropy": 2.351397395133972, "loss/hidden": 3.15625, "loss/jsd": 0.0, "loss/logits": 0.24569445848464966, "step": 2984 }, { "epoch": 0.186625, "grad_norm": 2.875, "grad_norm_var": 0.02564697265625, "learning_rate": 0.0001, "loss": 8.1168, "loss/crossentropy": 2.547055959701538, "loss/hidden": 3.2265625, "loss/jsd": 0.0, "loss/logits": 0.27938394248485565, "step": 2986 }, { "epoch": 0.18675, "grad_norm": 2.9375, "grad_norm_var": 0.026070149739583333, "learning_rate": 0.0001, "loss": 8.5189, "loss/crossentropy": 2.5092413425445557, "loss/hidden": 3.140625, "loss/jsd": 0.0, "loss/logits": 0.2599862962961197, "step": 2988 }, { "epoch": 0.186875, "grad_norm": 3.03125, "grad_norm_var": 0.029427083333333333, "learning_rate": 0.0001, "loss": 8.078, "loss/crossentropy": 2.412800908088684, "loss/hidden": 3.1875, "loss/jsd": 0.0, "loss/logits": 0.2972192317247391, "step": 2990 }, { "epoch": 0.187, "grad_norm": 3.28125, "grad_norm_var": 0.03290608723958333, "learning_rate": 0.0001, "loss": 8.4171, "loss/crossentropy": 2.5122843980789185, "loss/hidden": 3.1953125, "loss/jsd": 0.0, "loss/logits": 0.2523321807384491, "step": 2992 }, { "epoch": 0.187125, "grad_norm": 3.140625, "grad_norm_var": 0.04843343098958333, "learning_rate": 0.0001, "loss": 8.2715, "loss/crossentropy": 2.3395785093307495, "loss/hidden": 3.171875, "loss/jsd": 0.0, "loss/logits": 0.2835071235895157, "step": 2994 }, { "epoch": 0.18725, "grad_norm": 3.71875, "grad_norm_var": 0.06779683430989583, "learning_rate": 0.0001, "loss": 8.4595, "loss/crossentropy": 2.5921707153320312, "loss/hidden": 3.3203125, "loss/jsd": 0.0, "loss/logits": 0.32498699426651, "step": 2996 }, { "epoch": 0.187375, "grad_norm": 3.453125, "grad_norm_var": 0.06386617024739584, "learning_rate": 0.0001, "loss": 8.0491, "loss/crossentropy": 2.3352322578430176, "loss/hidden": 3.234375, "loss/jsd": 0.0, "loss/logits": 0.2577902227640152, "step": 2998 }, { "epoch": 0.1875, "grad_norm": 3.28125, "grad_norm_var": 0.05859375, "learning_rate": 0.0001, "loss": 8.2248, "loss/crossentropy": 2.3004229068756104, "loss/hidden": 3.1953125, "loss/jsd": 0.0, "loss/logits": 0.25653262436389923, "step": 3000 }, { "epoch": 0.187625, "grad_norm": 2.84375, "grad_norm_var": 0.0658203125, "learning_rate": 0.0001, "loss": 7.8124, "loss/crossentropy": 2.2351561784744263, "loss/hidden": 3.203125, "loss/jsd": 0.0, "loss/logits": 0.25118912756443024, "step": 3002 }, { "epoch": 0.18775, "grad_norm": 3.140625, "grad_norm_var": 0.06357320149739583, "learning_rate": 0.0001, "loss": 8.0213, "loss/crossentropy": 2.2484039068222046, "loss/hidden": 3.2421875, "loss/jsd": 0.0, "loss/logits": 0.26490475982427597, "step": 3004 }, { "epoch": 0.187875, "grad_norm": 3.078125, "grad_norm_var": 0.06610921223958334, "learning_rate": 0.0001, "loss": 8.1035, "loss/crossentropy": 2.3076168298721313, "loss/hidden": 3.203125, "loss/jsd": 0.0, "loss/logits": 0.2600875496864319, "step": 3006 }, { "epoch": 0.188, "grad_norm": 2.9375, "grad_norm_var": 0.06648661295572916, "learning_rate": 0.0001, "loss": 8.2658, "loss/crossentropy": 2.5785369873046875, "loss/hidden": 3.1796875, "loss/jsd": 0.0, "loss/logits": 0.2783546894788742, "step": 3008 }, { "epoch": 0.188125, "grad_norm": 3.1875, "grad_norm_var": 0.05540262858072917, "learning_rate": 0.0001, "loss": 8.3027, "loss/crossentropy": 2.3401262760162354, "loss/hidden": 3.234375, "loss/jsd": 0.0, "loss/logits": 0.2552775740623474, "step": 3010 }, { "epoch": 0.18825, "grad_norm": 2.890625, "grad_norm_var": 0.029296875, "learning_rate": 0.0001, "loss": 8.182, "loss/crossentropy": 2.122409999370575, "loss/hidden": 3.1875, "loss/jsd": 0.0, "loss/logits": 0.25811081379652023, "step": 3012 }, { "epoch": 0.188375, "grad_norm": 3.25, "grad_norm_var": 0.023368326822916667, "learning_rate": 0.0001, "loss": 8.0199, "loss/crossentropy": 2.427290201187134, "loss/hidden": 3.2421875, "loss/jsd": 0.0, "loss/logits": 0.26948370039463043, "step": 3014 }, { "epoch": 0.1885, "grad_norm": 2.984375, "grad_norm_var": 0.0194732666015625, "learning_rate": 0.0001, "loss": 8.3842, "loss/crossentropy": 2.355746865272522, "loss/hidden": 3.234375, "loss/jsd": 0.0, "loss/logits": 0.2620701193809509, "step": 3016 }, { "epoch": 0.188625, "grad_norm": 2.90625, "grad_norm_var": 0.0181793212890625, "learning_rate": 0.0001, "loss": 7.9699, "loss/crossentropy": 2.35839581489563, "loss/hidden": 3.1796875, "loss/jsd": 0.0, "loss/logits": 0.26620611548423767, "step": 3018 }, { "epoch": 0.18875, "grad_norm": 2.890625, "grad_norm_var": 0.019917805989583332, "learning_rate": 0.0001, "loss": 8.1946, "loss/crossentropy": 2.2129684686660767, "loss/hidden": 3.1484375, "loss/jsd": 0.0, "loss/logits": 0.2607392221689224, "step": 3020 }, { "epoch": 0.188875, "grad_norm": 3.046875, "grad_norm_var": 0.019091796875, "learning_rate": 0.0001, "loss": 8.2211, "loss/crossentropy": 2.1944016218185425, "loss/hidden": 3.140625, "loss/jsd": 0.0, "loss/logits": 0.24655280262231827, "step": 3022 }, { "epoch": 0.189, "grad_norm": 2.875, "grad_norm_var": 0.0239166259765625, "learning_rate": 0.0001, "loss": 8.0436, "loss/crossentropy": 2.1268292665481567, "loss/hidden": 3.171875, "loss/jsd": 0.0, "loss/logits": 0.2602980434894562, "step": 3024 }, { "epoch": 0.189125, "grad_norm": 2.875, "grad_norm_var": 0.021512858072916665, "learning_rate": 0.0001, "loss": 8.108, "loss/crossentropy": 2.4339241981506348, "loss/hidden": 3.1484375, "loss/jsd": 0.0, "loss/logits": 0.26776036620140076, "step": 3026 }, { "epoch": 0.18925, "grad_norm": 2.90625, "grad_norm_var": 0.022337849934895834, "learning_rate": 0.0001, "loss": 8.1682, "loss/crossentropy": 2.456682324409485, "loss/hidden": 3.25, "loss/jsd": 0.0, "loss/logits": 0.27592889964580536, "step": 3028 }, { "epoch": 0.189375, "grad_norm": 2.78125, "grad_norm_var": 0.013630167643229166, "learning_rate": 0.0001, "loss": 7.9461, "loss/crossentropy": 2.3544520139694214, "loss/hidden": 3.1953125, "loss/jsd": 0.0, "loss/logits": 0.25780095160007477, "step": 3030 }, { "epoch": 0.1895, "grad_norm": 3.09375, "grad_norm_var": 0.014557902018229167, "learning_rate": 0.0001, "loss": 8.1715, "loss/crossentropy": 2.363596200942993, "loss/hidden": 3.2734375, "loss/jsd": 0.0, "loss/logits": 0.29543258249759674, "step": 3032 }, { "epoch": 0.189625, "grad_norm": 3.484375, "grad_norm_var": 0.03345947265625, "learning_rate": 0.0001, "loss": 8.2126, "loss/crossentropy": 2.4506388902664185, "loss/hidden": 3.234375, "loss/jsd": 0.0, "loss/logits": 0.2730669379234314, "step": 3034 }, { "epoch": 0.18975, "grad_norm": 3.15625, "grad_norm_var": 0.06018778483072917, "learning_rate": 0.0001, "loss": 8.2254, "loss/crossentropy": 2.3496711254119873, "loss/hidden": 3.1796875, "loss/jsd": 0.0, "loss/logits": 0.2570537030696869, "step": 3036 }, { "epoch": 0.189875, "grad_norm": 3.09375, "grad_norm_var": 0.061258951822916664, "learning_rate": 0.0001, "loss": 8.2739, "loss/crossentropy": 2.3058911561965942, "loss/hidden": 3.1953125, "loss/jsd": 0.0, "loss/logits": 0.2748931795358658, "step": 3038 }, { "epoch": 0.19, "grad_norm": 2.90625, "grad_norm_var": 0.05432942708333333, "learning_rate": 0.0001, "loss": 8.2554, "loss/crossentropy": 2.1980836391448975, "loss/hidden": 3.234375, "loss/jsd": 0.0, "loss/logits": 0.26495523750782013, "step": 3040 }, { "epoch": 0.190125, "grad_norm": 3.421875, "grad_norm_var": 0.064501953125, "learning_rate": 0.0001, "loss": 8.255, "loss/crossentropy": 2.0824698209762573, "loss/hidden": 3.2109375, "loss/jsd": 0.0, "loss/logits": 0.24418041110038757, "step": 3042 }, { "epoch": 0.19025, "grad_norm": 3.359375, "grad_norm_var": 0.07330729166666666, "learning_rate": 0.0001, "loss": 8.3164, "loss/crossentropy": 2.5387042760849, "loss/hidden": 3.234375, "loss/jsd": 0.0, "loss/logits": 0.2929651141166687, "step": 3044 }, { "epoch": 0.190375, "grad_norm": 2.734375, "grad_norm_var": 0.08557535807291666, "learning_rate": 0.0001, "loss": 7.973, "loss/crossentropy": 2.1048532724380493, "loss/hidden": 3.1484375, "loss/jsd": 0.0, "loss/logits": 0.23939846456050873, "step": 3046 }, { "epoch": 0.1905, "grad_norm": 3.140625, "grad_norm_var": 0.08424072265625, "learning_rate": 0.0001, "loss": 8.0964, "loss/crossentropy": 2.418026924133301, "loss/hidden": 3.21875, "loss/jsd": 0.0, "loss/logits": 0.27147340774536133, "step": 3048 }, { "epoch": 0.190625, "grad_norm": 2.921875, "grad_norm_var": 0.07753499348958333, "learning_rate": 0.0001, "loss": 8.2169, "loss/crossentropy": 2.720233917236328, "loss/hidden": 3.1953125, "loss/jsd": 0.0, "loss/logits": 0.26460620760917664, "step": 3050 }, { "epoch": 0.19075, "grad_norm": 3.203125, "grad_norm_var": 0.053511555989583334, "learning_rate": 0.0001, "loss": 8.2677, "loss/crossentropy": 2.3085511922836304, "loss/hidden": 3.2265625, "loss/jsd": 0.0, "loss/logits": 0.2717476040124893, "step": 3052 }, { "epoch": 0.190875, "grad_norm": 3.03125, "grad_norm_var": 0.05455729166666667, "learning_rate": 0.0001, "loss": 8.2355, "loss/crossentropy": 2.387621521949768, "loss/hidden": 3.21875, "loss/jsd": 0.0, "loss/logits": 0.27137576043605804, "step": 3054 }, { "epoch": 0.191, "grad_norm": 3.0625, "grad_norm_var": 0.052708943684895836, "learning_rate": 0.0001, "loss": 8.2354, "loss/crossentropy": 2.2664815187454224, "loss/hidden": 3.328125, "loss/jsd": 0.0, "loss/logits": 0.27788451313972473, "step": 3056 }, { "epoch": 0.191125, "grad_norm": 2.90625, "grad_norm_var": 0.04145406087239583, "learning_rate": 0.0001, "loss": 8.1699, "loss/crossentropy": 2.19650661945343, "loss/hidden": 3.1875, "loss/jsd": 0.0, "loss/logits": 0.2952868938446045, "step": 3058 }, { "epoch": 0.19125, "grad_norm": 2.890625, "grad_norm_var": 0.023433430989583334, "learning_rate": 0.0001, "loss": 8.1568, "loss/crossentropy": 2.2924450635910034, "loss/hidden": 3.15625, "loss/jsd": 0.0, "loss/logits": 0.26796723902225494, "step": 3060 }, { "epoch": 0.191375, "grad_norm": 3.140625, "grad_norm_var": 0.011197916666666667, "learning_rate": 0.0001, "loss": 8.1256, "loss/crossentropy": 2.516156315803528, "loss/hidden": 3.2421875, "loss/jsd": 0.0, "loss/logits": 0.265813946723938, "step": 3062 }, { "epoch": 0.1915, "grad_norm": 3.046875, "grad_norm_var": 0.010570271809895834, "learning_rate": 0.0001, "loss": 8.2824, "loss/crossentropy": 2.3948129415512085, "loss/hidden": 3.2109375, "loss/jsd": 0.0, "loss/logits": 0.28667983412742615, "step": 3064 }, { "epoch": 0.191625, "grad_norm": 3.15625, "grad_norm_var": 0.010130818684895833, "learning_rate": 0.0001, "loss": 8.5698, "loss/crossentropy": 2.627001643180847, "loss/hidden": 3.2890625, "loss/jsd": 0.0, "loss/logits": 0.302081897854805, "step": 3066 }, { "epoch": 0.19175, "grad_norm": 3.3125, "grad_norm_var": 0.012906901041666667, "learning_rate": 0.0001, "loss": 8.2584, "loss/crossentropy": 2.6413527727127075, "loss/hidden": 3.265625, "loss/jsd": 0.0, "loss/logits": 0.27230359613895416, "step": 3068 }, { "epoch": 0.191875, "grad_norm": 3.03125, "grad_norm_var": 0.011091105143229167, "learning_rate": 0.0001, "loss": 8.2915, "loss/crossentropy": 2.3797744512557983, "loss/hidden": 3.1640625, "loss/jsd": 0.0, "loss/logits": 0.25851966440677643, "step": 3070 }, { "epoch": 0.192, "grad_norm": 3.265625, "grad_norm_var": 0.013426717122395833, "learning_rate": 0.0001, "loss": 8.3002, "loss/crossentropy": 2.2201138734817505, "loss/hidden": 3.1953125, "loss/jsd": 0.0, "loss/logits": 0.2628394812345505, "step": 3072 }, { "epoch": 0.192125, "grad_norm": 3.09375, "grad_norm_var": 0.012555948893229167, "learning_rate": 0.0001, "loss": 8.1419, "loss/crossentropy": 2.0795114636421204, "loss/hidden": 3.1640625, "loss/jsd": 0.0, "loss/logits": 0.2654409408569336, "step": 3074 }, { "epoch": 0.19225, "grad_norm": 2.75, "grad_norm_var": 0.0164703369140625, "learning_rate": 0.0001, "loss": 7.996, "loss/crossentropy": 2.2035024166107178, "loss/hidden": 3.125, "loss/jsd": 0.0, "loss/logits": 0.2541028633713722, "step": 3076 }, { "epoch": 0.192375, "grad_norm": 2.828125, "grad_norm_var": 0.02164306640625, "learning_rate": 0.0001, "loss": 8.1765, "loss/crossentropy": 2.4021114110946655, "loss/hidden": 3.171875, "loss/jsd": 0.0, "loss/logits": 0.2582816928625107, "step": 3078 }, { "epoch": 0.1925, "grad_norm": 2.953125, "grad_norm_var": 0.022679646809895832, "learning_rate": 0.0001, "loss": 7.9373, "loss/crossentropy": 2.2895009517669678, "loss/hidden": 3.2578125, "loss/jsd": 0.0, "loss/logits": 0.256507083773613, "step": 3080 }, { "epoch": 0.192625, "grad_norm": 2.8125, "grad_norm_var": 0.0277252197265625, "learning_rate": 0.0001, "loss": 7.8838, "loss/crossentropy": 2.027153968811035, "loss/hidden": 3.1953125, "loss/jsd": 0.0, "loss/logits": 0.23617319762706757, "step": 3082 }, { "epoch": 0.19275, "grad_norm": 3.078125, "grad_norm_var": 0.0377105712890625, "learning_rate": 0.0001, "loss": 8.3783, "loss/crossentropy": 2.6372686624526978, "loss/hidden": 3.21875, "loss/jsd": 0.0, "loss/logits": 0.2389061003923416, "step": 3084 }, { "epoch": 0.192875, "grad_norm": 3.046875, "grad_norm_var": 0.037369791666666666, "learning_rate": 0.0001, "loss": 8.3173, "loss/crossentropy": 2.4116551876068115, "loss/hidden": 3.234375, "loss/jsd": 0.0, "loss/logits": 0.27836497128009796, "step": 3086 }, { "epoch": 0.193, "grad_norm": 3.046875, "grad_norm_var": 0.04472554524739583, "learning_rate": 0.0001, "loss": 8.0528, "loss/crossentropy": 2.435407042503357, "loss/hidden": 3.203125, "loss/jsd": 0.0, "loss/logits": 0.25427422672510147, "step": 3088 }, { "epoch": 0.193125, "grad_norm": 2.953125, "grad_norm_var": 0.042769368489583334, "learning_rate": 0.0001, "loss": 8.2524, "loss/crossentropy": 2.6872342824935913, "loss/hidden": 3.28125, "loss/jsd": 0.0, "loss/logits": 0.2897500991821289, "step": 3090 }, { "epoch": 0.19325, "grad_norm": 2.734375, "grad_norm_var": 0.04388020833333333, "learning_rate": 0.0001, "loss": 8.1072, "loss/crossentropy": 2.4015711545944214, "loss/hidden": 3.21875, "loss/jsd": 0.0, "loss/logits": 0.2563893646001816, "step": 3092 }, { "epoch": 0.193375, "grad_norm": 3.03125, "grad_norm_var": 0.04967041015625, "learning_rate": 0.0001, "loss": 7.8814, "loss/crossentropy": 2.3149009943008423, "loss/hidden": 3.1328125, "loss/jsd": 0.0, "loss/logits": 0.254148505628109, "step": 3094 }, { "epoch": 0.1935, "grad_norm": 2.671875, "grad_norm_var": 0.06373291015625, "learning_rate": 0.0001, "loss": 7.6511, "loss/crossentropy": 1.942514955997467, "loss/hidden": 3.1640625, "loss/jsd": 0.0, "loss/logits": 0.2268311232328415, "step": 3096 }, { "epoch": 0.193625, "grad_norm": 3.15625, "grad_norm_var": 0.05998942057291667, "learning_rate": 0.0001, "loss": 8.1055, "loss/crossentropy": 2.512845039367676, "loss/hidden": 3.265625, "loss/jsd": 0.0, "loss/logits": 0.28133875131607056, "step": 3098 }, { "epoch": 0.19375, "grad_norm": 2.84375, "grad_norm_var": 0.047591145833333334, "learning_rate": 0.0001, "loss": 8.0487, "loss/crossentropy": 2.191560924053192, "loss/hidden": 3.1015625, "loss/jsd": 0.0, "loss/logits": 0.24201743304729462, "step": 3100 }, { "epoch": 0.193875, "grad_norm": 3.65625, "grad_norm_var": 0.14207356770833332, "learning_rate": 0.0001, "loss": 8.2637, "loss/crossentropy": 2.3829805850982666, "loss/hidden": 3.1953125, "loss/jsd": 0.0, "loss/logits": 0.2581256628036499, "step": 3102 }, { "epoch": 0.194, "grad_norm": 3.109375, "grad_norm_var": 0.13280843098958334, "learning_rate": 0.0001, "loss": 8.2235, "loss/crossentropy": 2.425102114677429, "loss/hidden": 3.2421875, "loss/jsd": 0.0, "loss/logits": 0.2605956494808197, "step": 3104 }, { "epoch": 0.194125, "grad_norm": 3.15625, "grad_norm_var": 0.1308502197265625, "learning_rate": 0.0001, "loss": 8.2458, "loss/crossentropy": 2.274181604385376, "loss/hidden": 3.1796875, "loss/jsd": 0.0, "loss/logits": 0.2674994617700577, "step": 3106 }, { "epoch": 0.19425, "grad_norm": 2.96875, "grad_norm_var": 0.12086181640625, "learning_rate": 0.0001, "loss": 8.123, "loss/crossentropy": 2.2365976572036743, "loss/hidden": 3.140625, "loss/jsd": 0.0, "loss/logits": 0.25523822009563446, "step": 3108 }, { "epoch": 0.194375, "grad_norm": 3.328125, "grad_norm_var": 0.11941731770833333, "learning_rate": 0.0001, "loss": 8.0437, "loss/crossentropy": 2.065447449684143, "loss/hidden": 3.2109375, "loss/jsd": 0.0, "loss/logits": 0.2694360166788101, "step": 3110 }, { "epoch": 0.1945, "grad_norm": 2.921875, "grad_norm_var": 0.11741536458333333, "learning_rate": 0.0001, "loss": 8.3343, "loss/crossentropy": 2.4269362688064575, "loss/hidden": 3.1640625, "loss/jsd": 0.0, "loss/logits": 0.25637270510196686, "step": 3112 }, { "epoch": 0.194625, "grad_norm": 3.0625, "grad_norm_var": 0.11825764973958333, "learning_rate": 0.0001, "loss": 8.2654, "loss/crossentropy": 2.4185571670532227, "loss/hidden": 3.1640625, "loss/jsd": 0.0, "loss/logits": 0.26571086049079895, "step": 3114 }, { "epoch": 0.19475, "grad_norm": 3.03125, "grad_norm_var": 0.10660807291666667, "learning_rate": 0.0001, "loss": 7.9204, "loss/crossentropy": 2.3637081384658813, "loss/hidden": 3.2265625, "loss/jsd": 0.0, "loss/logits": 0.25697462260723114, "step": 3116 }, { "epoch": 0.194875, "grad_norm": 3.0, "grad_norm_var": 0.03491923014322917, "learning_rate": 0.0001, "loss": 8.1335, "loss/crossentropy": 2.3663218021392822, "loss/hidden": 3.1640625, "loss/jsd": 0.0, "loss/logits": 0.2510784715414047, "step": 3118 }, { "epoch": 0.195, "grad_norm": 2.953125, "grad_norm_var": 0.03645731608072917, "learning_rate": 0.0001, "loss": 8.0906, "loss/crossentropy": 2.4295765161514282, "loss/hidden": 3.2421875, "loss/jsd": 0.0, "loss/logits": 0.2629868686199188, "step": 3120 }, { "epoch": 0.195125, "grad_norm": 2.90625, "grad_norm_var": 0.03824462890625, "learning_rate": 0.0001, "loss": 8.1089, "loss/crossentropy": 2.334370255470276, "loss/hidden": 3.21875, "loss/jsd": 0.0, "loss/logits": 0.25758758932352066, "step": 3122 }, { "epoch": 0.19525, "grad_norm": 2.8125, "grad_norm_var": 0.042313639322916666, "learning_rate": 0.0001, "loss": 8.0055, "loss/crossentropy": 2.156652331352234, "loss/hidden": 3.125, "loss/jsd": 0.0, "loss/logits": 0.24994631111621857, "step": 3124 }, { "epoch": 0.195375, "grad_norm": 2.921875, "grad_norm_var": 0.0381744384765625, "learning_rate": 0.0001, "loss": 8.1386, "loss/crossentropy": 2.3902939558029175, "loss/hidden": 3.1953125, "loss/jsd": 0.0, "loss/logits": 0.2595854699611664, "step": 3126 }, { "epoch": 0.1955, "grad_norm": 3.28125, "grad_norm_var": 0.01337890625, "learning_rate": 0.0001, "loss": 8.0796, "loss/crossentropy": 2.371825695037842, "loss/hidden": 3.1640625, "loss/jsd": 0.0, "loss/logits": 0.26566849648952484, "step": 3128 }, { "epoch": 0.195625, "grad_norm": 2.828125, "grad_norm_var": 0.017235310872395833, "learning_rate": 0.0001, "loss": 8.0604, "loss/crossentropy": 2.0502688884735107, "loss/hidden": 3.2109375, "loss/jsd": 0.0, "loss/logits": 0.24038948118686676, "step": 3130 }, { "epoch": 0.19575, "grad_norm": 3.125, "grad_norm_var": 0.01881103515625, "learning_rate": 0.0001, "loss": 8.1111, "loss/crossentropy": 2.6462838649749756, "loss/hidden": 3.171875, "loss/jsd": 0.0, "loss/logits": 0.27009811997413635, "step": 3132 }, { "epoch": 0.195875, "grad_norm": 3.15625, "grad_norm_var": 0.02281494140625, "learning_rate": 0.0001, "loss": 8.3116, "loss/crossentropy": 2.3181967735290527, "loss/hidden": 3.234375, "loss/jsd": 0.0, "loss/logits": 0.2602214217185974, "step": 3134 }, { "epoch": 0.196, "grad_norm": 3.171875, "grad_norm_var": 0.022526041666666666, "learning_rate": 0.0001, "loss": 8.3065, "loss/crossentropy": 2.4056873321533203, "loss/hidden": 3.1875, "loss/jsd": 0.0, "loss/logits": 0.26275748014450073, "step": 3136 }, { "epoch": 0.196125, "grad_norm": 3.359375, "grad_norm_var": 0.02760009765625, "learning_rate": 0.0001, "loss": 8.1514, "loss/crossentropy": 2.278952717781067, "loss/hidden": 3.140625, "loss/jsd": 0.0, "loss/logits": 0.2747359275817871, "step": 3138 }, { "epoch": 0.19625, "grad_norm": 2.90625, "grad_norm_var": 0.0237213134765625, "learning_rate": 0.0001, "loss": 8.1014, "loss/crossentropy": 2.417848587036133, "loss/hidden": 3.21875, "loss/jsd": 0.0, "loss/logits": 0.26167601346969604, "step": 3140 }, { "epoch": 0.196375, "grad_norm": 3.109375, "grad_norm_var": 0.0255279541015625, "learning_rate": 0.0001, "loss": 8.2388, "loss/crossentropy": 2.444987416267395, "loss/hidden": 3.203125, "loss/jsd": 0.0, "loss/logits": 0.25386446714401245, "step": 3142 }, { "epoch": 0.1965, "grad_norm": 3.046875, "grad_norm_var": 0.022705078125, "learning_rate": 0.0001, "loss": 8.2145, "loss/crossentropy": 2.581206440925598, "loss/hidden": 3.171875, "loss/jsd": 0.0, "loss/logits": 0.2710491418838501, "step": 3144 }, { "epoch": 0.196625, "grad_norm": 2.828125, "grad_norm_var": 0.0254058837890625, "learning_rate": 0.0001, "loss": 8.0613, "loss/crossentropy": 2.3687140941619873, "loss/hidden": 3.1484375, "loss/jsd": 0.0, "loss/logits": 0.26861967146396637, "step": 3146 }, { "epoch": 0.19675, "grad_norm": 2.9375, "grad_norm_var": 0.07899983723958333, "learning_rate": 0.0001, "loss": 8.3689, "loss/crossentropy": 2.3609704971313477, "loss/hidden": 3.2265625, "loss/jsd": 0.0, "loss/logits": 0.26762421429157257, "step": 3148 }, { "epoch": 0.196875, "grad_norm": 2.765625, "grad_norm_var": 0.08605855305989583, "learning_rate": 0.0001, "loss": 7.9722, "loss/crossentropy": 2.097190797328949, "loss/hidden": 3.109375, "loss/jsd": 0.0, "loss/logits": 0.24098625779151917, "step": 3150 }, { "epoch": 0.197, "grad_norm": 3.09375, "grad_norm_var": 0.0851226806640625, "learning_rate": 0.0001, "loss": 8.2039, "loss/crossentropy": 2.257096529006958, "loss/hidden": 3.140625, "loss/jsd": 0.0, "loss/logits": 0.2598903179168701, "step": 3152 }, { "epoch": 0.197125, "grad_norm": 2.875, "grad_norm_var": 0.08181050618489584, "learning_rate": 0.0001, "loss": 8.1718, "loss/crossentropy": 2.4441792964935303, "loss/hidden": 3.1796875, "loss/jsd": 0.0, "loss/logits": 0.2703270763158798, "step": 3154 }, { "epoch": 0.19725, "grad_norm": 3.171875, "grad_norm_var": 0.0839996337890625, "learning_rate": 0.0001, "loss": 8.1738, "loss/crossentropy": 2.337058424949646, "loss/hidden": 3.1953125, "loss/jsd": 0.0, "loss/logits": 0.25680898129940033, "step": 3156 }, { "epoch": 0.197375, "grad_norm": 2.96875, "grad_norm_var": 0.08325093587239583, "learning_rate": 0.0001, "loss": 8.1708, "loss/crossentropy": 2.3203121423721313, "loss/hidden": 3.1875, "loss/jsd": 0.0, "loss/logits": 0.25183166563510895, "step": 3158 }, { "epoch": 0.1975, "grad_norm": 2.875, "grad_norm_var": 0.08502604166666666, "learning_rate": 0.0001, "loss": 7.9677, "loss/crossentropy": 2.3680754899978638, "loss/hidden": 3.125, "loss/jsd": 0.0, "loss/logits": 0.2431572675704956, "step": 3160 }, { "epoch": 0.197625, "grad_norm": 3.203125, "grad_norm_var": 0.08133036295572917, "learning_rate": 0.0001, "loss": 8.203, "loss/crossentropy": 2.241086721420288, "loss/hidden": 3.25, "loss/jsd": 0.0, "loss/logits": 0.27713850140571594, "step": 3162 }, { "epoch": 0.19775, "grad_norm": 2.9375, "grad_norm_var": 0.025260416666666667, "learning_rate": 0.0001, "loss": 8.3118, "loss/crossentropy": 2.5448096990585327, "loss/hidden": 3.2421875, "loss/jsd": 0.0, "loss/logits": 0.2678837478160858, "step": 3164 }, { "epoch": 0.197875, "grad_norm": 2.796875, "grad_norm_var": 0.022956339518229167, "learning_rate": 0.0001, "loss": 8.0958, "loss/crossentropy": 2.4013454914093018, "loss/hidden": 3.1953125, "loss/jsd": 0.0, "loss/logits": 0.2578812763094902, "step": 3166 }, { "epoch": 0.198, "grad_norm": 3.109375, "grad_norm_var": 0.0243560791015625, "learning_rate": 0.0001, "loss": 8.3649, "loss/crossentropy": 2.399568200111389, "loss/hidden": 3.1796875, "loss/jsd": 0.0, "loss/logits": 0.3332839608192444, "step": 3168 }, { "epoch": 0.198125, "grad_norm": 2.78125, "grad_norm_var": 0.023726399739583334, "learning_rate": 0.0001, "loss": 7.9721, "loss/crossentropy": 2.496425151824951, "loss/hidden": 3.2734375, "loss/jsd": 0.0, "loss/logits": 0.28880220651626587, "step": 3170 }, { "epoch": 0.19825, "grad_norm": 2.953125, "grad_norm_var": 0.020992024739583334, "learning_rate": 0.0001, "loss": 8.2263, "loss/crossentropy": 2.2559762001037598, "loss/hidden": 3.1328125, "loss/jsd": 0.0, "loss/logits": 0.263287752866745, "step": 3172 }, { "epoch": 0.198375, "grad_norm": 3.0, "grad_norm_var": 0.018294270833333334, "learning_rate": 0.0001, "loss": 8.2044, "loss/crossentropy": 2.4847277402877808, "loss/hidden": 3.1796875, "loss/jsd": 0.0, "loss/logits": 0.26287899166345596, "step": 3174 }, { "epoch": 0.1985, "grad_norm": 3.109375, "grad_norm_var": 0.01822509765625, "learning_rate": 0.0001, "loss": 8.0561, "loss/crossentropy": 2.5184192657470703, "loss/hidden": 3.2109375, "loss/jsd": 0.0, "loss/logits": 0.274020254611969, "step": 3176 }, { "epoch": 0.198625, "grad_norm": 3.015625, "grad_norm_var": 0.01279296875, "learning_rate": 0.0001, "loss": 8.0441, "loss/crossentropy": 2.276871681213379, "loss/hidden": 3.1015625, "loss/jsd": 0.0, "loss/logits": 0.23629513382911682, "step": 3178 }, { "epoch": 0.19875, "grad_norm": 3.203125, "grad_norm_var": 0.016014607747395833, "learning_rate": 0.0001, "loss": 8.1896, "loss/crossentropy": 2.236708164215088, "loss/hidden": 3.1796875, "loss/jsd": 0.0, "loss/logits": 0.2376183420419693, "step": 3180 }, { "epoch": 0.198875, "grad_norm": 2.859375, "grad_norm_var": 0.0204010009765625, "learning_rate": 0.0001, "loss": 7.8184, "loss/crossentropy": 2.289997696876526, "loss/hidden": 3.109375, "loss/jsd": 0.0, "loss/logits": 0.26156531274318695, "step": 3182 }, { "epoch": 0.199, "grad_norm": 2.890625, "grad_norm_var": 0.0199859619140625, "learning_rate": 0.0001, "loss": 8.3089, "loss/crossentropy": 2.327690005302429, "loss/hidden": 3.2109375, "loss/jsd": 0.0, "loss/logits": 0.2894662618637085, "step": 3184 }, { "epoch": 0.199125, "grad_norm": 3.203125, "grad_norm_var": 0.02037353515625, "learning_rate": 0.0001, "loss": 8.0743, "loss/crossentropy": 2.418406367301941, "loss/hidden": 3.1796875, "loss/jsd": 0.0, "loss/logits": 0.25407615303993225, "step": 3186 }, { "epoch": 0.19925, "grad_norm": 2.8125, "grad_norm_var": 0.0223785400390625, "learning_rate": 0.0001, "loss": 8.0573, "loss/crossentropy": 2.243067741394043, "loss/hidden": 3.125, "loss/jsd": 0.0, "loss/logits": 0.2523237615823746, "step": 3188 }, { "epoch": 0.199375, "grad_norm": 3.078125, "grad_norm_var": 0.027242024739583332, "learning_rate": 0.0001, "loss": 7.8769, "loss/crossentropy": 2.4334983825683594, "loss/hidden": 3.125, "loss/jsd": 0.0, "loss/logits": 0.23196329176425934, "step": 3190 }, { "epoch": 0.1995, "grad_norm": 2.65625, "grad_norm_var": 0.035868326822916664, "learning_rate": 0.0001, "loss": 7.9538, "loss/crossentropy": 2.311228036880493, "loss/hidden": 3.125, "loss/jsd": 0.0, "loss/logits": 0.2579493597149849, "step": 3192 }, { "epoch": 0.199625, "grad_norm": 3.140625, "grad_norm_var": 0.03538309733072917, "learning_rate": 0.0001, "loss": 8.2196, "loss/crossentropy": 2.431540369987488, "loss/hidden": 3.125, "loss/jsd": 0.0, "loss/logits": 0.25777457654476166, "step": 3194 }, { "epoch": 0.19975, "grad_norm": 2.953125, "grad_norm_var": 0.03191731770833333, "learning_rate": 0.0001, "loss": 8.139, "loss/crossentropy": 2.1621546745300293, "loss/hidden": 3.140625, "loss/jsd": 0.0, "loss/logits": 0.2560386508703232, "step": 3196 }, { "epoch": 0.199875, "grad_norm": 2.828125, "grad_norm_var": 0.025712076822916666, "learning_rate": 0.0001, "loss": 7.97, "loss/crossentropy": 2.1923757791519165, "loss/hidden": 3.1171875, "loss/jsd": 0.0, "loss/logits": 0.2589757889509201, "step": 3198 }, { "epoch": 0.2, "grad_norm": 2.875, "grad_norm_var": 0.0247467041015625, "learning_rate": 0.0001, "loss": 8.0392, "loss/crossentropy": 2.3970407247543335, "loss/hidden": 3.125, "loss/jsd": 0.0, "loss/logits": 0.2541755437850952, "step": 3200 }, { "epoch": 0.200125, "grad_norm": 2.8125, "grad_norm_var": 0.020406087239583332, "learning_rate": 0.0001, "loss": 8.0207, "loss/crossentropy": 2.2216036319732666, "loss/hidden": 3.1640625, "loss/jsd": 0.0, "loss/logits": 0.26332978904247284, "step": 3202 }, { "epoch": 0.20025, "grad_norm": 2.875, "grad_norm_var": 0.02056884765625, "learning_rate": 0.0001, "loss": 8.2514, "loss/crossentropy": 2.2952345609664917, "loss/hidden": 3.109375, "loss/jsd": 0.0, "loss/logits": 0.24460511654615402, "step": 3204 }, { "epoch": 0.200375, "grad_norm": 3.171875, "grad_norm_var": 0.027497355143229166, "learning_rate": 0.0001, "loss": 8.3172, "loss/crossentropy": 2.3329302072525024, "loss/hidden": 3.203125, "loss/jsd": 0.0, "loss/logits": 0.3068646192550659, "step": 3206 }, { "epoch": 0.2005, "grad_norm": 3.0, "grad_norm_var": 0.018464152018229166, "learning_rate": 0.0001, "loss": 8.0853, "loss/crossentropy": 2.4682726860046387, "loss/hidden": 3.203125, "loss/jsd": 0.0, "loss/logits": 0.2747759073972702, "step": 3208 }, { "epoch": 0.200625, "grad_norm": 2.90625, "grad_norm_var": 0.015555826822916667, "learning_rate": 0.0001, "loss": 8.084, "loss/crossentropy": 2.385040760040283, "loss/hidden": 3.171875, "loss/jsd": 0.0, "loss/logits": 0.2688244730234146, "step": 3210 }, { "epoch": 0.20075, "grad_norm": 2.984375, "grad_norm_var": 0.01578369140625, "learning_rate": 0.0001, "loss": 8.2028, "loss/crossentropy": 2.3609360456466675, "loss/hidden": 3.21875, "loss/jsd": 0.0, "loss/logits": 0.2741740494966507, "step": 3212 }, { "epoch": 0.200875, "grad_norm": 3.078125, "grad_norm_var": 0.015946451822916666, "learning_rate": 0.0001, "loss": 8.1162, "loss/crossentropy": 2.268782615661621, "loss/hidden": 3.125, "loss/jsd": 0.0, "loss/logits": 0.25281913578510284, "step": 3214 }, { "epoch": 0.201, "grad_norm": 2.90625, "grad_norm_var": 0.0151519775390625, "learning_rate": 0.0001, "loss": 7.9198, "loss/crossentropy": 2.2365881204605103, "loss/hidden": 3.171875, "loss/jsd": 0.0, "loss/logits": 0.24488338828086853, "step": 3216 }, { "epoch": 0.201125, "grad_norm": 3.125, "grad_norm_var": 0.014134724934895834, "learning_rate": 0.0001, "loss": 8.0622, "loss/crossentropy": 2.333191156387329, "loss/hidden": 3.1953125, "loss/jsd": 0.0, "loss/logits": 0.2846767157316208, "step": 3218 }, { "epoch": 0.20125, "grad_norm": 2.984375, "grad_norm_var": 0.014925130208333333, "learning_rate": 0.0001, "loss": 7.765, "loss/crossentropy": 2.2031763792037964, "loss/hidden": 3.1875, "loss/jsd": 0.0, "loss/logits": 0.2630941718816757, "step": 3220 }, { "epoch": 0.201375, "grad_norm": 2.953125, "grad_norm_var": 0.007515462239583334, "learning_rate": 0.0001, "loss": 7.8986, "loss/crossentropy": 2.228626847267151, "loss/hidden": 3.15625, "loss/jsd": 0.0, "loss/logits": 0.24427320063114166, "step": 3222 }, { "epoch": 0.2015, "grad_norm": 3.0625, "grad_norm_var": 0.007710774739583333, "learning_rate": 0.0001, "loss": 7.8828, "loss/crossentropy": 2.3146544694900513, "loss/hidden": 3.15625, "loss/jsd": 0.0, "loss/logits": 0.26063594222068787, "step": 3224 }, { "epoch": 0.201625, "grad_norm": 2.921875, "grad_norm_var": 0.0090484619140625, "learning_rate": 0.0001, "loss": 8.0466, "loss/crossentropy": 2.5943726301193237, "loss/hidden": 3.1875, "loss/jsd": 0.0, "loss/logits": 0.2648574709892273, "step": 3226 }, { "epoch": 0.20175, "grad_norm": 2.75, "grad_norm_var": 0.0123687744140625, "learning_rate": 0.0001, "loss": 8.1293, "loss/crossentropy": 2.481694221496582, "loss/hidden": 3.1796875, "loss/jsd": 0.0, "loss/logits": 0.26219360530376434, "step": 3228 }, { "epoch": 0.201875, "grad_norm": 2.9375, "grad_norm_var": 0.010578409830729166, "learning_rate": 0.0001, "loss": 8.0384, "loss/crossentropy": 2.2695223093032837, "loss/hidden": 3.125, "loss/jsd": 0.0, "loss/logits": 0.27425651252269745, "step": 3230 }, { "epoch": 0.202, "grad_norm": 2.75, "grad_norm_var": 0.013736979166666666, "learning_rate": 0.0001, "loss": 8.0473, "loss/crossentropy": 2.2828463315963745, "loss/hidden": 3.15625, "loss/jsd": 0.0, "loss/logits": 0.25569504499435425, "step": 3232 }, { "epoch": 0.202125, "grad_norm": 3.0625, "grad_norm_var": 0.012450154622395833, "learning_rate": 0.0001, "loss": 8.0475, "loss/crossentropy": 2.402729630470276, "loss/hidden": 3.2109375, "loss/jsd": 0.0, "loss/logits": 0.26215188205242157, "step": 3234 }, { "epoch": 0.20225, "grad_norm": 2.953125, "grad_norm_var": 0.0115142822265625, "learning_rate": 0.0001, "loss": 8.1686, "loss/crossentropy": 2.3639482259750366, "loss/hidden": 3.15625, "loss/jsd": 0.0, "loss/logits": 0.2587997019290924, "step": 3236 }, { "epoch": 0.202375, "grad_norm": 2.96875, "grad_norm_var": 0.012044270833333334, "learning_rate": 0.0001, "loss": 8.0236, "loss/crossentropy": 2.2139264345169067, "loss/hidden": 3.1484375, "loss/jsd": 0.0, "loss/logits": 0.25370609760284424, "step": 3238 }, { "epoch": 0.2025, "grad_norm": 2.765625, "grad_norm_var": 0.01025390625, "learning_rate": 0.0001, "loss": 7.8858, "loss/crossentropy": 2.37356698513031, "loss/hidden": 3.1484375, "loss/jsd": 0.0, "loss/logits": 0.2569102644920349, "step": 3240 }, { "epoch": 0.202625, "grad_norm": 3.09375, "grad_norm_var": 0.015013631184895833, "learning_rate": 0.0001, "loss": 8.0907, "loss/crossentropy": 2.1355791091918945, "loss/hidden": 3.1953125, "loss/jsd": 0.0, "loss/logits": 0.2584020048379898, "step": 3242 }, { "epoch": 0.20275, "grad_norm": 2.890625, "grad_norm_var": 0.013963826497395833, "learning_rate": 0.0001, "loss": 7.9375, "loss/crossentropy": 2.1258978247642517, "loss/hidden": 3.1796875, "loss/jsd": 0.0, "loss/logits": 0.2648678421974182, "step": 3244 }, { "epoch": 0.202875, "grad_norm": 2.9375, "grad_norm_var": 0.014383951822916666, "learning_rate": 0.0001, "loss": 8.0241, "loss/crossentropy": 2.2715872526168823, "loss/hidden": 3.1796875, "loss/jsd": 0.0, "loss/logits": 0.2862909138202667, "step": 3246 }, { "epoch": 0.203, "grad_norm": 3.0, "grad_norm_var": 0.018473307291666668, "learning_rate": 0.0001, "loss": 8.1095, "loss/crossentropy": 2.333642363548279, "loss/hidden": 3.2109375, "loss/jsd": 0.0, "loss/logits": 0.2708228975534439, "step": 3248 }, { "epoch": 0.203125, "grad_norm": 3.171875, "grad_norm_var": 0.0251953125, "learning_rate": 0.0001, "loss": 8.3236, "loss/crossentropy": 2.464186906814575, "loss/hidden": 3.15625, "loss/jsd": 0.0, "loss/logits": 0.2748962640762329, "step": 3250 }, { "epoch": 0.20325, "grad_norm": 3.09375, "grad_norm_var": 0.0252838134765625, "learning_rate": 0.0001, "loss": 8.289, "loss/crossentropy": 2.0890082120895386, "loss/hidden": 3.1484375, "loss/jsd": 0.0, "loss/logits": 0.24715138971805573, "step": 3252 }, { "epoch": 0.203375, "grad_norm": 2.765625, "grad_norm_var": 0.0266998291015625, "learning_rate": 0.0001, "loss": 7.9402, "loss/crossentropy": 2.181369960308075, "loss/hidden": 3.2109375, "loss/jsd": 0.0, "loss/logits": 0.26574352383613586, "step": 3254 }, { "epoch": 0.2035, "grad_norm": 3.140625, "grad_norm_var": 0.02701416015625, "learning_rate": 0.0001, "loss": 7.8531, "loss/crossentropy": 2.281398892402649, "loss/hidden": 3.15625, "loss/jsd": 0.0, "loss/logits": 0.2439633458852768, "step": 3256 }, { "epoch": 0.203625, "grad_norm": 2.953125, "grad_norm_var": 0.0259918212890625, "learning_rate": 0.0001, "loss": 7.9964, "loss/crossentropy": 2.1536207795143127, "loss/hidden": 3.1875, "loss/jsd": 0.0, "loss/logits": 0.24516429007053375, "step": 3258 }, { "epoch": 0.20375, "grad_norm": 3.15625, "grad_norm_var": 0.026155598958333335, "learning_rate": 0.0001, "loss": 8.0773, "loss/crossentropy": 2.070763051509857, "loss/hidden": 3.1484375, "loss/jsd": 0.0, "loss/logits": 0.2518838942050934, "step": 3260 }, { "epoch": 0.203875, "grad_norm": 3.71875, "grad_norm_var": 0.05364176432291667, "learning_rate": 0.0001, "loss": 8.0644, "loss/crossentropy": 2.411260724067688, "loss/hidden": 3.1484375, "loss/jsd": 0.0, "loss/logits": 0.2600061446428299, "step": 3262 }, { "epoch": 0.204, "grad_norm": 2.859375, "grad_norm_var": 0.05373942057291667, "learning_rate": 0.0001, "loss": 8.1634, "loss/crossentropy": 2.3903297185897827, "loss/hidden": 3.2109375, "loss/jsd": 0.0, "loss/logits": 0.248751699924469, "step": 3264 }, { "epoch": 0.204125, "grad_norm": 3.03125, "grad_norm_var": 0.052179972330729164, "learning_rate": 0.0001, "loss": 8.2074, "loss/crossentropy": 2.1826690435409546, "loss/hidden": 3.1640625, "loss/jsd": 0.0, "loss/logits": 0.2685079574584961, "step": 3266 }, { "epoch": 0.20425, "grad_norm": 3.109375, "grad_norm_var": 0.05537007649739583, "learning_rate": 0.0001, "loss": 8.2012, "loss/crossentropy": 2.207249701023102, "loss/hidden": 3.1953125, "loss/jsd": 0.0, "loss/logits": 0.2473110854625702, "step": 3268 }, { "epoch": 0.204375, "grad_norm": 3.015625, "grad_norm_var": 0.0469146728515625, "learning_rate": 0.0001, "loss": 7.9611, "loss/crossentropy": 2.4302347898483276, "loss/hidden": 3.140625, "loss/jsd": 0.0, "loss/logits": 0.24606862664222717, "step": 3270 }, { "epoch": 0.2045, "grad_norm": 2.890625, "grad_norm_var": 0.043309529622395836, "learning_rate": 0.0001, "loss": 8.1377, "loss/crossentropy": 2.538500189781189, "loss/hidden": 3.2109375, "loss/jsd": 0.0, "loss/logits": 0.27223630249500275, "step": 3272 }, { "epoch": 0.204625, "grad_norm": 2.90625, "grad_norm_var": 0.049559529622395834, "learning_rate": 0.0001, "loss": 7.8854, "loss/crossentropy": 2.048095464706421, "loss/hidden": 3.1484375, "loss/jsd": 0.0, "loss/logits": 0.24504344165325165, "step": 3274 }, { "epoch": 0.20475, "grad_norm": 2.8125, "grad_norm_var": 0.053076171875, "learning_rate": 0.0001, "loss": 7.9718, "loss/crossentropy": 2.0703362226486206, "loss/hidden": 3.3359375, "loss/jsd": 0.0, "loss/logits": 0.2712066099047661, "step": 3276 }, { "epoch": 0.204875, "grad_norm": 3.78125, "grad_norm_var": 0.05878804524739583, "learning_rate": 0.0001, "loss": 8.423, "loss/crossentropy": 2.4606130123138428, "loss/hidden": 3.1953125, "loss/jsd": 0.0, "loss/logits": 0.31335532665252686, "step": 3278 }, { "epoch": 0.205, "grad_norm": 3.03125, "grad_norm_var": 0.05712890625, "learning_rate": 0.0001, "loss": 8.1598, "loss/crossentropy": 2.6250627040863037, "loss/hidden": 3.1953125, "loss/jsd": 0.0, "loss/logits": 0.27594996988773346, "step": 3280 }, { "epoch": 0.205125, "grad_norm": 2.859375, "grad_norm_var": 0.056818644205729164, "learning_rate": 0.0001, "loss": 8.2019, "loss/crossentropy": 2.4209065437316895, "loss/hidden": 3.1796875, "loss/jsd": 0.0, "loss/logits": 0.25507232546806335, "step": 3282 }, { "epoch": 0.20525, "grad_norm": 2.953125, "grad_norm_var": 0.05328369140625, "learning_rate": 0.0001, "loss": 8.1579, "loss/crossentropy": 2.3270163536071777, "loss/hidden": 3.1328125, "loss/jsd": 0.0, "loss/logits": 0.24912425875663757, "step": 3284 }, { "epoch": 0.205375, "grad_norm": 3.015625, "grad_norm_var": 0.0549957275390625, "learning_rate": 0.0001, "loss": 7.9852, "loss/crossentropy": 2.4678841829299927, "loss/hidden": 3.140625, "loss/jsd": 0.0, "loss/logits": 0.23587578535079956, "step": 3286 }, { "epoch": 0.2055, "grad_norm": 3.0, "grad_norm_var": 0.051167805989583336, "learning_rate": 0.0001, "loss": 8.105, "loss/crossentropy": 2.420538544654846, "loss/hidden": 3.09375, "loss/jsd": 0.0, "loss/logits": 0.2402632236480713, "step": 3288 }, { "epoch": 0.205625, "grad_norm": 2.796875, "grad_norm_var": 0.05025634765625, "learning_rate": 0.0001, "loss": 8.0947, "loss/crossentropy": 2.5115219354629517, "loss/hidden": 3.265625, "loss/jsd": 0.0, "loss/logits": 0.25969623029232025, "step": 3290 }, { "epoch": 0.20575, "grad_norm": 3.078125, "grad_norm_var": 0.04841206868489583, "learning_rate": 0.0001, "loss": 8.242, "loss/crossentropy": 2.2653130292892456, "loss/hidden": 3.1953125, "loss/jsd": 0.0, "loss/logits": 0.2700078934431076, "step": 3292 }, { "epoch": 0.205875, "grad_norm": 3.5625, "grad_norm_var": 0.0318756103515625, "learning_rate": 0.0001, "loss": 8.3417, "loss/crossentropy": 2.481539011001587, "loss/hidden": 3.1640625, "loss/jsd": 0.0, "loss/logits": 0.2482079267501831, "step": 3294 }, { "epoch": 0.206, "grad_norm": 2.875, "grad_norm_var": 0.056550089518229166, "learning_rate": 0.0001, "loss": 8.1301, "loss/crossentropy": 2.377937436103821, "loss/hidden": 3.1328125, "loss/jsd": 0.0, "loss/logits": 0.252384252846241, "step": 3296 }, { "epoch": 0.206125, "grad_norm": 2.984375, "grad_norm_var": 0.054488118489583334, "learning_rate": 0.0001, "loss": 8.2856, "loss/crossentropy": 2.3734689950942993, "loss/hidden": 3.171875, "loss/jsd": 0.0, "loss/logits": 0.25039682537317276, "step": 3298 }, { "epoch": 0.20625, "grad_norm": 2.84375, "grad_norm_var": 0.05681966145833333, "learning_rate": 0.0001, "loss": 7.9778, "loss/crossentropy": 2.237556576728821, "loss/hidden": 3.1484375, "loss/jsd": 0.0, "loss/logits": 0.23857159167528152, "step": 3300 }, { "epoch": 0.206375, "grad_norm": 3.046875, "grad_norm_var": 0.0544342041015625, "learning_rate": 0.0001, "loss": 7.984, "loss/crossentropy": 2.286033868789673, "loss/hidden": 3.15625, "loss/jsd": 0.0, "loss/logits": 0.2503318265080452, "step": 3302 }, { "epoch": 0.2065, "grad_norm": 2.96875, "grad_norm_var": 0.06236063639322917, "learning_rate": 0.0001, "loss": 8.1663, "loss/crossentropy": 2.111438810825348, "loss/hidden": 3.3203125, "loss/jsd": 0.0, "loss/logits": 0.26578497141599655, "step": 3304 }, { "epoch": 0.206625, "grad_norm": 2.78125, "grad_norm_var": 0.06298421223958334, "learning_rate": 0.0001, "loss": 8.047, "loss/crossentropy": 2.0786361694335938, "loss/hidden": 3.1171875, "loss/jsd": 0.0, "loss/logits": 0.2384149432182312, "step": 3306 }, { "epoch": 0.20675, "grad_norm": 2.9375, "grad_norm_var": 0.0692047119140625, "learning_rate": 0.0001, "loss": 7.8227, "loss/crossentropy": 2.0808927416801453, "loss/hidden": 3.125, "loss/jsd": 0.0, "loss/logits": 0.2648046165704727, "step": 3308 }, { "epoch": 0.206875, "grad_norm": 2.9375, "grad_norm_var": 0.05110575358072917, "learning_rate": 0.0001, "loss": 8.0012, "loss/crossentropy": 2.1514230966567993, "loss/hidden": 3.078125, "loss/jsd": 0.0, "loss/logits": 0.24816033244132996, "step": 3310 }, { "epoch": 0.207, "grad_norm": 2.921875, "grad_norm_var": 0.023502604166666666, "learning_rate": 0.0001, "loss": 7.9675, "loss/crossentropy": 2.114220142364502, "loss/hidden": 3.2421875, "loss/jsd": 0.0, "loss/logits": 0.22722011804580688, "step": 3312 }, { "epoch": 0.207125, "grad_norm": 3.015625, "grad_norm_var": 0.026220703125, "learning_rate": 0.0001, "loss": 7.8712, "loss/crossentropy": 2.1372103095054626, "loss/hidden": 3.25, "loss/jsd": 0.0, "loss/logits": 0.2645547240972519, "step": 3314 }, { "epoch": 0.20725, "grad_norm": 2.84375, "grad_norm_var": 0.0248931884765625, "learning_rate": 0.0001, "loss": 8.0588, "loss/crossentropy": 2.372753381729126, "loss/hidden": 3.1640625, "loss/jsd": 0.0, "loss/logits": 0.25332552194595337, "step": 3316 }, { "epoch": 0.207375, "grad_norm": 3.0625, "grad_norm_var": 0.027049763997395834, "learning_rate": 0.0001, "loss": 8.1708, "loss/crossentropy": 2.311069369316101, "loss/hidden": 3.171875, "loss/jsd": 0.0, "loss/logits": 0.2605705112218857, "step": 3318 }, { "epoch": 0.2075, "grad_norm": 3.140625, "grad_norm_var": 0.017943318684895834, "learning_rate": 0.0001, "loss": 8.1425, "loss/crossentropy": 2.4386746883392334, "loss/hidden": 3.171875, "loss/jsd": 0.0, "loss/logits": 0.2544742077589035, "step": 3320 }, { "epoch": 0.207625, "grad_norm": 3.25, "grad_norm_var": 0.020750935872395834, "learning_rate": 0.0001, "loss": 7.968, "loss/crossentropy": 2.1591333150863647, "loss/hidden": 3.1484375, "loss/jsd": 0.0, "loss/logits": 0.24341313540935516, "step": 3322 }, { "epoch": 0.20775, "grad_norm": 2.90625, "grad_norm_var": 0.018583170572916665, "learning_rate": 0.0001, "loss": 8.2134, "loss/crossentropy": 2.328689455986023, "loss/hidden": 3.1640625, "loss/jsd": 0.0, "loss/logits": 0.29594168066978455, "step": 3324 }, { "epoch": 0.207875, "grad_norm": 3.140625, "grad_norm_var": 0.023661295572916668, "learning_rate": 0.0001, "loss": 7.9217, "loss/crossentropy": 2.1437469720840454, "loss/hidden": 3.1484375, "loss/jsd": 0.0, "loss/logits": 0.24640139937400818, "step": 3326 }, { "epoch": 0.208, "grad_norm": 3.125, "grad_norm_var": 0.0243560791015625, "learning_rate": 0.0001, "loss": 8.0628, "loss/crossentropy": 2.4024256467819214, "loss/hidden": 3.203125, "loss/jsd": 0.0, "loss/logits": 0.2733878195285797, "step": 3328 }, { "epoch": 0.208125, "grad_norm": 2.921875, "grad_norm_var": 0.025386555989583334, "learning_rate": 0.0001, "loss": 7.8516, "loss/crossentropy": 2.1901514530181885, "loss/hidden": 3.1328125, "loss/jsd": 0.0, "loss/logits": 0.23804646730422974, "step": 3330 }, { "epoch": 0.20825, "grad_norm": 2.90625, "grad_norm_var": 0.024828084309895835, "learning_rate": 0.0001, "loss": 8.109, "loss/crossentropy": 2.291813850402832, "loss/hidden": 3.15625, "loss/jsd": 0.0, "loss/logits": 0.2448124885559082, "step": 3332 }, { "epoch": 0.208375, "grad_norm": 2.921875, "grad_norm_var": 0.025406901041666666, "learning_rate": 0.0001, "loss": 7.9948, "loss/crossentropy": 2.341397523880005, "loss/hidden": 3.15625, "loss/jsd": 0.0, "loss/logits": 0.27015161514282227, "step": 3334 }, { "epoch": 0.2085, "grad_norm": 3.140625, "grad_norm_var": 0.022761027018229168, "learning_rate": 0.0001, "loss": 8.1965, "loss/crossentropy": 2.3533178567886353, "loss/hidden": 3.1171875, "loss/jsd": 0.0, "loss/logits": 0.25063496828079224, "step": 3336 }, { "epoch": 0.208625, "grad_norm": 2.65625, "grad_norm_var": 0.03072509765625, "learning_rate": 0.0001, "loss": 8.1318, "loss/crossentropy": 2.0992120504379272, "loss/hidden": 3.1484375, "loss/jsd": 0.0, "loss/logits": 0.2553541362285614, "step": 3338 }, { "epoch": 0.20875, "grad_norm": 3.09375, "grad_norm_var": 0.033665974934895836, "learning_rate": 0.0001, "loss": 8.0182, "loss/crossentropy": 2.382994294166565, "loss/hidden": 3.171875, "loss/jsd": 0.0, "loss/logits": 0.28810542821884155, "step": 3340 }, { "epoch": 0.208875, "grad_norm": 2.96875, "grad_norm_var": 0.027562459309895832, "learning_rate": 0.0001, "loss": 7.9908, "loss/crossentropy": 2.1971789598464966, "loss/hidden": 3.1640625, "loss/jsd": 0.0, "loss/logits": 0.2596438080072403, "step": 3342 }, { "epoch": 0.209, "grad_norm": 3.234375, "grad_norm_var": 0.030029296875, "learning_rate": 0.0001, "loss": 7.7718, "loss/crossentropy": 2.0156781673431396, "loss/hidden": 3.21875, "loss/jsd": 0.0, "loss/logits": 0.23752722889184952, "step": 3344 }, { "epoch": 0.209125, "grad_norm": 2.984375, "grad_norm_var": 0.026123046875, "learning_rate": 0.0001, "loss": 7.9132, "loss/crossentropy": 2.2873259782791138, "loss/hidden": 3.1953125, "loss/jsd": 0.0, "loss/logits": 0.26303017139434814, "step": 3346 }, { "epoch": 0.20925, "grad_norm": 2.84375, "grad_norm_var": 0.026854451497395834, "learning_rate": 0.0001, "loss": 8.1614, "loss/crossentropy": 2.208059072494507, "loss/hidden": 3.203125, "loss/jsd": 0.0, "loss/logits": 0.25297391414642334, "step": 3348 }, { "epoch": 0.209375, "grad_norm": 3.046875, "grad_norm_var": 0.026432291666666666, "learning_rate": 0.0001, "loss": 8.2134, "loss/crossentropy": 2.664340019226074, "loss/hidden": 3.2265625, "loss/jsd": 0.0, "loss/logits": 0.2778072953224182, "step": 3350 }, { "epoch": 0.2095, "grad_norm": 3.125, "grad_norm_var": 0.025951131184895834, "learning_rate": 0.0001, "loss": 7.9863, "loss/crossentropy": 2.352488398551941, "loss/hidden": 3.2109375, "loss/jsd": 0.0, "loss/logits": 0.280301034450531, "step": 3352 }, { "epoch": 0.209625, "grad_norm": 2.9375, "grad_norm_var": 0.013109334309895833, "learning_rate": 0.0001, "loss": 7.9529, "loss/crossentropy": 2.3037261962890625, "loss/hidden": 3.171875, "loss/jsd": 0.0, "loss/logits": 0.2568385750055313, "step": 3354 }, { "epoch": 0.20975, "grad_norm": 2.953125, "grad_norm_var": 0.009175618489583334, "learning_rate": 0.0001, "loss": 8.0492, "loss/crossentropy": 2.264007806777954, "loss/hidden": 3.109375, "loss/jsd": 0.0, "loss/logits": 0.2372257262468338, "step": 3356 }, { "epoch": 0.209875, "grad_norm": 2.875, "grad_norm_var": 0.043187459309895836, "learning_rate": 0.0001, "loss": 8.1985, "loss/crossentropy": 2.265676975250244, "loss/hidden": 3.2265625, "loss/jsd": 0.0, "loss/logits": 0.30991341173648834, "step": 3358 }, { "epoch": 0.21, "grad_norm": 3.203125, "grad_norm_var": 0.04296875, "learning_rate": 0.0001, "loss": 8.0834, "loss/crossentropy": 2.170192003250122, "loss/hidden": 3.15625, "loss/jsd": 0.0, "loss/logits": 0.2553609609603882, "step": 3360 }, { "epoch": 0.210125, "grad_norm": 2.96875, "grad_norm_var": 0.04361572265625, "learning_rate": 0.0001, "loss": 8.091, "loss/crossentropy": 2.4134016036987305, "loss/hidden": 3.203125, "loss/jsd": 0.0, "loss/logits": 0.2930755317211151, "step": 3362 }, { "epoch": 0.21025, "grad_norm": 3.796875, "grad_norm_var": 0.07550455729166666, "learning_rate": 0.0001, "loss": 8.1486, "loss/crossentropy": 2.3027660846710205, "loss/hidden": 3.2421875, "loss/jsd": 0.0, "loss/logits": 0.2863481193780899, "step": 3364 }, { "epoch": 0.210375, "grad_norm": 2.90625, "grad_norm_var": 0.07553609212239583, "learning_rate": 0.0001, "loss": 7.9561, "loss/crossentropy": 2.3249882459640503, "loss/hidden": 3.1328125, "loss/jsd": 0.0, "loss/logits": 0.25326602160930634, "step": 3366 }, { "epoch": 0.2105, "grad_norm": 2.953125, "grad_norm_var": 0.07991129557291667, "learning_rate": 0.0001, "loss": 8.1748, "loss/crossentropy": 2.429360032081604, "loss/hidden": 3.203125, "loss/jsd": 0.0, "loss/logits": 0.2706481069326401, "step": 3368 }, { "epoch": 0.210625, "grad_norm": 3.109375, "grad_norm_var": 0.08212890625, "learning_rate": 0.0001, "loss": 8.0211, "loss/crossentropy": 2.304826259613037, "loss/hidden": 3.203125, "loss/jsd": 0.0, "loss/logits": 0.2733375281095505, "step": 3370 }, { "epoch": 0.21075, "grad_norm": 3.453125, "grad_norm_var": 0.10440165201822917, "learning_rate": 0.0001, "loss": 8.5313, "loss/crossentropy": 2.6415737867355347, "loss/hidden": 3.1875, "loss/jsd": 0.0, "loss/logits": 0.28427866101264954, "step": 3372 }, { "epoch": 0.210875, "grad_norm": 3.390625, "grad_norm_var": 0.08263346354166666, "learning_rate": 0.0001, "loss": 8.1368, "loss/crossentropy": 2.3548909425735474, "loss/hidden": 3.15625, "loss/jsd": 0.0, "loss/logits": 0.25943076610565186, "step": 3374 }, { "epoch": 0.211, "grad_norm": 2.984375, "grad_norm_var": 0.086669921875, "learning_rate": 0.0001, "loss": 8.057, "loss/crossentropy": 2.2008095383644104, "loss/hidden": 3.15625, "loss/jsd": 0.0, "loss/logits": 0.2705947160720825, "step": 3376 }, { "epoch": 0.211125, "grad_norm": 3.375, "grad_norm_var": 0.08528238932291667, "learning_rate": 0.0001, "loss": 8.1684, "loss/crossentropy": 2.1185187101364136, "loss/hidden": 3.1484375, "loss/jsd": 0.0, "loss/logits": 0.23934519290924072, "step": 3378 }, { "epoch": 0.21125, "grad_norm": 2.890625, "grad_norm_var": 0.06855061848958334, "learning_rate": 0.0001, "loss": 7.9323, "loss/crossentropy": 2.183789014816284, "loss/hidden": 3.1171875, "loss/jsd": 0.0, "loss/logits": 0.23662084341049194, "step": 3380 }, { "epoch": 0.211375, "grad_norm": 3.515625, "grad_norm_var": 0.07534077962239584, "learning_rate": 0.0001, "loss": 8.3151, "loss/crossentropy": 2.4148218631744385, "loss/hidden": 3.234375, "loss/jsd": 0.0, "loss/logits": 0.2555703818798065, "step": 3382 }, { "epoch": 0.2115, "grad_norm": 3.078125, "grad_norm_var": 0.06721089680989584, "learning_rate": 0.0001, "loss": 8.1862, "loss/crossentropy": 2.5065032243728638, "loss/hidden": 3.296875, "loss/jsd": 0.0, "loss/logits": 0.28839460015296936, "step": 3384 }, { "epoch": 0.211625, "grad_norm": 3.125, "grad_norm_var": 0.06750895182291666, "learning_rate": 0.0001, "loss": 8.0797, "loss/crossentropy": 2.2872482538223267, "loss/hidden": 3.171875, "loss/jsd": 0.0, "loss/logits": 0.2494898959994316, "step": 3386 }, { "epoch": 0.21175, "grad_norm": 2.734375, "grad_norm_var": 0.04421284993489583, "learning_rate": 0.0001, "loss": 8.0956, "loss/crossentropy": 2.249394178390503, "loss/hidden": 3.09375, "loss/jsd": 0.0, "loss/logits": 0.23356334120035172, "step": 3388 }, { "epoch": 0.211875, "grad_norm": 2.953125, "grad_norm_var": 0.04309488932291667, "learning_rate": 0.0001, "loss": 8.1253, "loss/crossentropy": 2.4072612524032593, "loss/hidden": 3.15625, "loss/jsd": 0.0, "loss/logits": 0.2705316096544266, "step": 3390 }, { "epoch": 0.212, "grad_norm": 3.015625, "grad_norm_var": 0.04153645833333333, "learning_rate": 0.0001, "loss": 8.1955, "loss/crossentropy": 2.291887044906616, "loss/hidden": 3.1796875, "loss/jsd": 0.0, "loss/logits": 0.2566347047686577, "step": 3392 }, { "epoch": 0.212125, "grad_norm": 3.015625, "grad_norm_var": 0.03092041015625, "learning_rate": 0.0001, "loss": 8.1182, "loss/crossentropy": 2.4818975925445557, "loss/hidden": 3.15625, "loss/jsd": 0.0, "loss/logits": 0.27201250195503235, "step": 3394 }, { "epoch": 0.21225, "grad_norm": 2.9375, "grad_norm_var": 0.029426066080729167, "learning_rate": 0.0001, "loss": 8.1119, "loss/crossentropy": 2.3387409448623657, "loss/hidden": 3.2109375, "loss/jsd": 0.0, "loss/logits": 0.26730673760175705, "step": 3396 }, { "epoch": 0.212375, "grad_norm": 3.125, "grad_norm_var": 0.014013671875, "learning_rate": 0.0001, "loss": 8.0371, "loss/crossentropy": 2.4898757934570312, "loss/hidden": 3.25, "loss/jsd": 0.0, "loss/logits": 0.2796122133731842, "step": 3398 }, { "epoch": 0.2125, "grad_norm": 2.96875, "grad_norm_var": 0.0157623291015625, "learning_rate": 0.0001, "loss": 7.9214, "loss/crossentropy": 2.3467652797698975, "loss/hidden": 3.1328125, "loss/jsd": 0.0, "loss/logits": 0.25767359137535095, "step": 3400 }, { "epoch": 0.212625, "grad_norm": 2.859375, "grad_norm_var": 0.014891560872395833, "learning_rate": 0.0001, "loss": 8.2623, "loss/crossentropy": 2.23625385761261, "loss/hidden": 3.1328125, "loss/jsd": 0.0, "loss/logits": 0.2665071487426758, "step": 3402 }, { "epoch": 0.21275, "grad_norm": 2.90625, "grad_norm_var": 0.010221354166666667, "learning_rate": 0.0001, "loss": 8.0001, "loss/crossentropy": 2.247014105319977, "loss/hidden": 3.1328125, "loss/jsd": 0.0, "loss/logits": 0.24841443449258804, "step": 3404 }, { "epoch": 0.212875, "grad_norm": 2.875, "grad_norm_var": 0.012886555989583333, "learning_rate": 0.0001, "loss": 8.0313, "loss/crossentropy": 2.3653587102890015, "loss/hidden": 3.125, "loss/jsd": 0.0, "loss/logits": 0.27540309727191925, "step": 3406 }, { "epoch": 0.213, "grad_norm": 3.171875, "grad_norm_var": 0.0152740478515625, "learning_rate": 0.0001, "loss": 8.2313, "loss/crossentropy": 2.2608449459075928, "loss/hidden": 3.15625, "loss/jsd": 0.0, "loss/logits": 0.2475418895483017, "step": 3408 }, { "epoch": 0.213125, "grad_norm": 2.875, "grad_norm_var": 0.0187896728515625, "learning_rate": 0.0001, "loss": 8.2868, "loss/crossentropy": 1.9663435816764832, "loss/hidden": 3.171875, "loss/jsd": 0.0, "loss/logits": 0.27278994023799896, "step": 3410 }, { "epoch": 0.21325, "grad_norm": 3.0, "grad_norm_var": 0.018550618489583334, "learning_rate": 0.0001, "loss": 8.1122, "loss/crossentropy": 2.391019344329834, "loss/hidden": 3.1484375, "loss/jsd": 0.0, "loss/logits": 0.25076402723789215, "step": 3412 }, { "epoch": 0.213375, "grad_norm": 2.84375, "grad_norm_var": 0.017210896809895834, "learning_rate": 0.0001, "loss": 7.9714, "loss/crossentropy": 2.4968901872634888, "loss/hidden": 3.25, "loss/jsd": 0.0, "loss/logits": 0.28580130636692047, "step": 3414 }, { "epoch": 0.2135, "grad_norm": 3.109375, "grad_norm_var": 0.019234212239583333, "learning_rate": 0.0001, "loss": 7.8824, "loss/crossentropy": 2.0557892322540283, "loss/hidden": 3.125, "loss/jsd": 0.0, "loss/logits": 0.24151277542114258, "step": 3416 }, { "epoch": 0.213625, "grad_norm": 2.890625, "grad_norm_var": 0.019627888997395832, "learning_rate": 0.0001, "loss": 8.0965, "loss/crossentropy": 2.443650245666504, "loss/hidden": 3.125, "loss/jsd": 0.0, "loss/logits": 0.25222641229629517, "step": 3418 }, { "epoch": 0.21375, "grad_norm": 2.90625, "grad_norm_var": 0.022419230143229166, "learning_rate": 0.0001, "loss": 8.2063, "loss/crossentropy": 2.750308632850647, "loss/hidden": 3.2265625, "loss/jsd": 0.0, "loss/logits": 0.2700583189725876, "step": 3420 }, { "epoch": 0.213875, "grad_norm": 3.03125, "grad_norm_var": 0.01842041015625, "learning_rate": 0.0001, "loss": 8.051, "loss/crossentropy": 2.1019209027290344, "loss/hidden": 3.1171875, "loss/jsd": 0.0, "loss/logits": 0.24724262952804565, "step": 3422 }, { "epoch": 0.214, "grad_norm": 2.875, "grad_norm_var": 0.015360514322916666, "learning_rate": 0.0001, "loss": 7.9942, "loss/crossentropy": 2.328813672065735, "loss/hidden": 3.1171875, "loss/jsd": 0.0, "loss/logits": 0.2349339872598648, "step": 3424 }, { "epoch": 0.214125, "grad_norm": 2.671875, "grad_norm_var": 0.013932291666666667, "learning_rate": 0.0001, "loss": 7.8529, "loss/crossentropy": 2.2752292156219482, "loss/hidden": 3.1796875, "loss/jsd": 0.0, "loss/logits": 0.25418810546398163, "step": 3426 }, { "epoch": 0.21425, "grad_norm": 2.96875, "grad_norm_var": 0.0168365478515625, "learning_rate": 0.0001, "loss": 7.9415, "loss/crossentropy": 2.260936737060547, "loss/hidden": 3.2109375, "loss/jsd": 0.0, "loss/logits": 0.25729209184646606, "step": 3428 }, { "epoch": 0.214375, "grad_norm": 2.890625, "grad_norm_var": 0.0165191650390625, "learning_rate": 0.0001, "loss": 8.1508, "loss/crossentropy": 2.213426113128662, "loss/hidden": 3.09375, "loss/jsd": 0.0, "loss/logits": 0.2391301691532135, "step": 3430 }, { "epoch": 0.2145, "grad_norm": 2.96875, "grad_norm_var": 0.013020833333333334, "learning_rate": 0.0001, "loss": 8.0022, "loss/crossentropy": 2.320794105529785, "loss/hidden": 3.1484375, "loss/jsd": 0.0, "loss/logits": 0.24946201592683792, "step": 3432 }, { "epoch": 0.214625, "grad_norm": 2.8125, "grad_norm_var": 0.0132476806640625, "learning_rate": 0.0001, "loss": 8.0841, "loss/crossentropy": 2.2941734790802, "loss/hidden": 3.15625, "loss/jsd": 0.0, "loss/logits": 0.24938707053661346, "step": 3434 }, { "epoch": 0.21475, "grad_norm": 3.03125, "grad_norm_var": 0.011551920572916667, "learning_rate": 0.0001, "loss": 8.1231, "loss/crossentropy": 2.1613428592681885, "loss/hidden": 3.1015625, "loss/jsd": 0.0, "loss/logits": 0.231092631816864, "step": 3436 }, { "epoch": 0.214875, "grad_norm": 2.765625, "grad_norm_var": 0.015067545572916667, "learning_rate": 0.0001, "loss": 7.8562, "loss/crossentropy": 2.3163094520568848, "loss/hidden": 3.15625, "loss/jsd": 0.0, "loss/logits": 0.24684642255306244, "step": 3438 }, { "epoch": 0.215, "grad_norm": 2.921875, "grad_norm_var": 0.014615885416666667, "learning_rate": 0.0001, "loss": 8.1419, "loss/crossentropy": 2.346468925476074, "loss/hidden": 3.140625, "loss/jsd": 0.0, "loss/logits": 0.2459520548582077, "step": 3440 }, { "epoch": 0.215125, "grad_norm": 3.078125, "grad_norm_var": 0.031473795572916664, "learning_rate": 0.0001, "loss": 7.9344, "loss/crossentropy": 2.299746036529541, "loss/hidden": 3.125, "loss/jsd": 0.0, "loss/logits": 0.2519551143050194, "step": 3442 }, { "epoch": 0.21525, "grad_norm": 2.671875, "grad_norm_var": 0.03622639973958333, "learning_rate": 0.0001, "loss": 8.0242, "loss/crossentropy": 2.2825082540512085, "loss/hidden": 3.15625, "loss/jsd": 0.0, "loss/logits": 0.24382151663303375, "step": 3444 }, { "epoch": 0.215375, "grad_norm": 2.921875, "grad_norm_var": 0.036799112955729164, "learning_rate": 0.0001, "loss": 8.1511, "loss/crossentropy": 2.724141240119934, "loss/hidden": 3.1875, "loss/jsd": 0.0, "loss/logits": 0.26099613308906555, "step": 3446 }, { "epoch": 0.2155, "grad_norm": 2.75, "grad_norm_var": 0.041356404622395836, "learning_rate": 0.0001, "loss": 8.01, "loss/crossentropy": 2.4468116760253906, "loss/hidden": 3.1328125, "loss/jsd": 0.0, "loss/logits": 0.2531883865594864, "step": 3448 }, { "epoch": 0.215625, "grad_norm": 3.109375, "grad_norm_var": 0.04243062337239583, "learning_rate": 0.0001, "loss": 8.3209, "loss/crossentropy": 2.35987651348114, "loss/hidden": 3.2734375, "loss/jsd": 0.0, "loss/logits": 0.2925818860530853, "step": 3450 }, { "epoch": 0.21575, "grad_norm": 2.96875, "grad_norm_var": 0.04248758951822917, "learning_rate": 0.0001, "loss": 8.2423, "loss/crossentropy": 2.2506964802742004, "loss/hidden": 3.171875, "loss/jsd": 0.0, "loss/logits": 0.2657378166913986, "step": 3452 }, { "epoch": 0.215875, "grad_norm": 3.0, "grad_norm_var": 0.0366363525390625, "learning_rate": 0.0001, "loss": 7.8316, "loss/crossentropy": 2.4656643867492676, "loss/hidden": 3.140625, "loss/jsd": 0.0, "loss/logits": 0.2636348456144333, "step": 3454 }, { "epoch": 0.216, "grad_norm": 3.046875, "grad_norm_var": 0.03827718098958333, "learning_rate": 0.0001, "loss": 7.9689, "loss/crossentropy": 2.1773873567581177, "loss/hidden": 3.125, "loss/jsd": 0.0, "loss/logits": 0.2358511984348297, "step": 3456 }, { "epoch": 0.216125, "grad_norm": 2.984375, "grad_norm_var": 0.019514973958333334, "learning_rate": 0.0001, "loss": 8.2903, "loss/crossentropy": 2.4382131099700928, "loss/hidden": 3.1484375, "loss/jsd": 0.0, "loss/logits": 0.26277345418930054, "step": 3458 }, { "epoch": 0.21625, "grad_norm": 2.84375, "grad_norm_var": 0.028514607747395834, "learning_rate": 0.0001, "loss": 7.986, "loss/crossentropy": 2.137218475341797, "loss/hidden": 3.1796875, "loss/jsd": 0.0, "loss/logits": 0.2463318258523941, "step": 3460 }, { "epoch": 0.216375, "grad_norm": 2.78125, "grad_norm_var": 0.03186442057291667, "learning_rate": 0.0001, "loss": 8.0746, "loss/crossentropy": 2.5652899742126465, "loss/hidden": 3.140625, "loss/jsd": 0.0, "loss/logits": 0.24091246724128723, "step": 3462 }, { "epoch": 0.2165, "grad_norm": 2.953125, "grad_norm_var": 0.0273101806640625, "learning_rate": 0.0001, "loss": 8.2663, "loss/crossentropy": 2.269050359725952, "loss/hidden": 3.1640625, "loss/jsd": 0.0, "loss/logits": 0.24550612270832062, "step": 3464 }, { "epoch": 0.216625, "grad_norm": 3.109375, "grad_norm_var": 0.0274078369140625, "learning_rate": 0.0001, "loss": 8.1145, "loss/crossentropy": 2.1748660802841187, "loss/hidden": 3.1875, "loss/jsd": 0.0, "loss/logits": 0.2580345571041107, "step": 3466 }, { "epoch": 0.21675, "grad_norm": 2.75, "grad_norm_var": 0.03177083333333333, "learning_rate": 0.0001, "loss": 7.7683, "loss/crossentropy": 2.464895486831665, "loss/hidden": 3.1640625, "loss/jsd": 0.0, "loss/logits": 0.26822274923324585, "step": 3468 }, { "epoch": 0.216875, "grad_norm": 3.015625, "grad_norm_var": 0.03211161295572917, "learning_rate": 0.0001, "loss": 8.0132, "loss/crossentropy": 2.516330361366272, "loss/hidden": 3.171875, "loss/jsd": 0.0, "loss/logits": 0.27802979946136475, "step": 3470 }, { "epoch": 0.217, "grad_norm": 2.953125, "grad_norm_var": 0.0298980712890625, "learning_rate": 0.0001, "loss": 8.0509, "loss/crossentropy": 2.593145251274109, "loss/hidden": 3.1875, "loss/jsd": 0.0, "loss/logits": 0.26215869188308716, "step": 3472 }, { "epoch": 0.217125, "grad_norm": 2.8125, "grad_norm_var": 0.029781087239583334, "learning_rate": 0.0001, "loss": 7.9716, "loss/crossentropy": 2.226263165473938, "loss/hidden": 3.1796875, "loss/jsd": 0.0, "loss/logits": 0.26481927931308746, "step": 3474 }, { "epoch": 0.21725, "grad_norm": 2.953125, "grad_norm_var": 0.013939412434895833, "learning_rate": 0.0001, "loss": 7.9203, "loss/crossentropy": 2.1908382177352905, "loss/hidden": 3.1875, "loss/jsd": 0.0, "loss/logits": 0.24750825762748718, "step": 3476 }, { "epoch": 0.217375, "grad_norm": 2.75, "grad_norm_var": 0.018876139322916666, "learning_rate": 0.0001, "loss": 8.1505, "loss/crossentropy": 2.1081652641296387, "loss/hidden": 3.1171875, "loss/jsd": 0.0, "loss/logits": 0.24459867179393768, "step": 3478 }, { "epoch": 0.2175, "grad_norm": 2.9375, "grad_norm_var": 0.019136555989583335, "learning_rate": 0.0001, "loss": 7.9132, "loss/crossentropy": 2.07977694272995, "loss/hidden": 3.109375, "loss/jsd": 0.0, "loss/logits": 0.2468918189406395, "step": 3480 }, { "epoch": 0.217625, "grad_norm": 2.765625, "grad_norm_var": 0.018778483072916668, "learning_rate": 0.0001, "loss": 8.036, "loss/crossentropy": 2.2569565773010254, "loss/hidden": 3.171875, "loss/jsd": 0.0, "loss/logits": 0.2362232357263565, "step": 3482 }, { "epoch": 0.21775, "grad_norm": 2.859375, "grad_norm_var": 0.016999308268229166, "learning_rate": 0.0001, "loss": 8.0578, "loss/crossentropy": 2.3093096017837524, "loss/hidden": 3.1328125, "loss/jsd": 0.0, "loss/logits": 0.2618033364415169, "step": 3484 }, { "epoch": 0.217875, "grad_norm": 2.765625, "grad_norm_var": 0.018245442708333334, "learning_rate": 0.0001, "loss": 7.9073, "loss/crossentropy": 2.1073737144470215, "loss/hidden": 3.109375, "loss/jsd": 0.0, "loss/logits": 0.23664266616106033, "step": 3486 }, { "epoch": 0.218, "grad_norm": 11.875, "grad_norm_var": 7.317122395833334, "learning_rate": 0.0001, "loss": 8.6386, "loss/crossentropy": 2.440091371536255, "loss/hidden": 3.2109375, "loss/jsd": 0.0, "loss/logits": 0.2737603783607483, "step": 3488 }, { "epoch": 0.218125, "grad_norm": 3.125, "grad_norm_var": 7.408426920572917, "learning_rate": 0.0001, "loss": 8.3179, "loss/crossentropy": 2.4426426887512207, "loss/hidden": 3.1953125, "loss/jsd": 0.0, "loss/logits": 0.2780514657497406, "step": 3490 }, { "epoch": 0.21825, "grad_norm": 3.046875, "grad_norm_var": 7.375365193684896, "learning_rate": 0.0001, "loss": 8.0277, "loss/crossentropy": 2.196107029914856, "loss/hidden": 3.1484375, "loss/jsd": 0.0, "loss/logits": 0.25197281688451767, "step": 3492 }, { "epoch": 0.218375, "grad_norm": 3.140625, "grad_norm_var": 7.33541259765625, "learning_rate": 0.0001, "loss": 8.0933, "loss/crossentropy": 2.404140591621399, "loss/hidden": 3.171875, "loss/jsd": 0.0, "loss/logits": 0.24803149700164795, "step": 3494 }, { "epoch": 0.2185, "grad_norm": 2.890625, "grad_norm_var": 7.330557250976563, "learning_rate": 0.0001, "loss": 7.9468, "loss/crossentropy": 2.261552095413208, "loss/hidden": 3.203125, "loss/jsd": 0.0, "loss/logits": 0.2754479944705963, "step": 3496 }, { "epoch": 0.218625, "grad_norm": 3.21875, "grad_norm_var": 7.257470703125, "learning_rate": 0.0001, "loss": 8.0821, "loss/crossentropy": 2.354046583175659, "loss/hidden": 3.203125, "loss/jsd": 0.0, "loss/logits": 0.2597518563270569, "step": 3498 }, { "epoch": 0.21875, "grad_norm": 6.40625, "grad_norm_var": 7.410123697916666, "learning_rate": 0.0001, "loss": 8.6704, "loss/crossentropy": 2.2524830102920532, "loss/hidden": 3.109375, "loss/jsd": 0.0, "loss/logits": 0.26117394119501114, "step": 3500 }, { "epoch": 0.218875, "grad_norm": 3.203125, "grad_norm_var": 7.230793253580729, "learning_rate": 0.0001, "loss": 8.2819, "loss/crossentropy": 2.433029532432556, "loss/hidden": 3.203125, "loss/jsd": 0.0, "loss/logits": 0.26937395334243774, "step": 3502 }, { "epoch": 0.219, "grad_norm": 3.125, "grad_norm_var": 1.079613240559896, "learning_rate": 0.0001, "loss": 8.0829, "loss/crossentropy": 2.180580735206604, "loss/hidden": 3.109375, "loss/jsd": 0.0, "loss/logits": 0.24801570177078247, "step": 3504 }, { "epoch": 0.219125, "grad_norm": 3.15625, "grad_norm_var": 0.7117472330729167, "learning_rate": 0.0001, "loss": 8.0049, "loss/crossentropy": 2.497164011001587, "loss/hidden": 3.140625, "loss/jsd": 0.0, "loss/logits": 0.2396157830953598, "step": 3506 }, { "epoch": 0.21925, "grad_norm": 3.046875, "grad_norm_var": 0.70777587890625, "learning_rate": 0.0001, "loss": 8.0631, "loss/crossentropy": 2.0866791009902954, "loss/hidden": 3.125, "loss/jsd": 0.0, "loss/logits": 0.2625848799943924, "step": 3508 }, { "epoch": 0.219375, "grad_norm": 3.234375, "grad_norm_var": 0.70172119140625, "learning_rate": 0.0001, "loss": 8.0452, "loss/crossentropy": 2.189347505569458, "loss/hidden": 3.1875, "loss/jsd": 0.0, "loss/logits": 0.23898432403802872, "step": 3510 }, { "epoch": 0.2195, "grad_norm": 2.875, "grad_norm_var": 0.7069986979166667, "learning_rate": 0.0001, "loss": 8.1909, "loss/crossentropy": 2.3433092832565308, "loss/hidden": 3.1796875, "loss/jsd": 0.0, "loss/logits": 0.2604905217885971, "step": 3512 }, { "epoch": 0.219625, "grad_norm": 2.796875, "grad_norm_var": 0.7274241129557292, "learning_rate": 0.0001, "loss": 7.9611, "loss/crossentropy": 2.4440083503723145, "loss/hidden": 3.1640625, "loss/jsd": 0.0, "loss/logits": 0.2595008611679077, "step": 3514 }, { "epoch": 0.21975, "grad_norm": 3.09375, "grad_norm_var": 0.04013264973958333, "learning_rate": 0.0001, "loss": 8.2323, "loss/crossentropy": 2.4784871339797974, "loss/hidden": 3.21875, "loss/jsd": 0.0, "loss/logits": 0.2541923224925995, "step": 3516 }, { "epoch": 0.219875, "grad_norm": 3.109375, "grad_norm_var": 0.0164947509765625, "learning_rate": 0.0001, "loss": 7.9576, "loss/crossentropy": 2.083495259284973, "loss/hidden": 3.1015625, "loss/jsd": 0.0, "loss/logits": 0.24065294116735458, "step": 3518 }, { "epoch": 0.22, "grad_norm": 3.0, "grad_norm_var": 0.015965779622395832, "learning_rate": 0.0001, "loss": 7.9361, "loss/crossentropy": 2.15146005153656, "loss/hidden": 3.0703125, "loss/jsd": 0.0, "loss/logits": 0.24030451476573944, "step": 3520 }, { "epoch": 0.220125, "grad_norm": 3.03125, "grad_norm_var": 0.014582316080729166, "learning_rate": 0.0001, "loss": 8.107, "loss/crossentropy": 2.2797966599464417, "loss/hidden": 3.0390625, "loss/jsd": 0.0, "loss/logits": 0.2358318492770195, "step": 3522 }, { "epoch": 0.22025, "grad_norm": 3.09375, "grad_norm_var": 0.019071451822916665, "learning_rate": 0.0001, "loss": 8.0834, "loss/crossentropy": 2.270912528038025, "loss/hidden": 3.1953125, "loss/jsd": 0.0, "loss/logits": 0.2688527777791023, "step": 3524 }, { "epoch": 0.220375, "grad_norm": 2.765625, "grad_norm_var": 0.021044921875, "learning_rate": 0.0001, "loss": 7.9002, "loss/crossentropy": 2.1109871864318848, "loss/hidden": 3.09375, "loss/jsd": 0.0, "loss/logits": 0.24296989291906357, "step": 3526 }, { "epoch": 0.2205, "grad_norm": 3.734375, "grad_norm_var": 0.0573883056640625, "learning_rate": 0.0001, "loss": 8.0665, "loss/crossentropy": 2.3173556327819824, "loss/hidden": 3.109375, "loss/jsd": 0.0, "loss/logits": 0.2529396265745163, "step": 3528 }, { "epoch": 0.220625, "grad_norm": 3.078125, "grad_norm_var": 0.05663655598958333, "learning_rate": 0.0001, "loss": 8.0593, "loss/crossentropy": 2.168402671813965, "loss/hidden": 3.140625, "loss/jsd": 0.0, "loss/logits": 0.26267802715301514, "step": 3530 }, { "epoch": 0.22075, "grad_norm": 2.828125, "grad_norm_var": 0.0636871337890625, "learning_rate": 0.0001, "loss": 8.2203, "loss/crossentropy": 2.3692712783813477, "loss/hidden": 3.1796875, "loss/jsd": 0.0, "loss/logits": 0.2798602133989334, "step": 3532 }, { "epoch": 0.220875, "grad_norm": 2.578125, "grad_norm_var": 0.07527567545572916, "learning_rate": 0.0001, "loss": 7.9934, "loss/crossentropy": 2.441710114479065, "loss/hidden": 3.15625, "loss/jsd": 0.0, "loss/logits": 0.264100581407547, "step": 3534 }, { "epoch": 0.221, "grad_norm": 2.84375, "grad_norm_var": 0.07697652180989584, "learning_rate": 0.0001, "loss": 7.914, "loss/crossentropy": 2.2376210689544678, "loss/hidden": 3.125, "loss/jsd": 0.0, "loss/logits": 0.2386428266763687, "step": 3536 }, { "epoch": 0.221125, "grad_norm": 2.78125, "grad_norm_var": 0.08683268229166667, "learning_rate": 0.0001, "loss": 7.8692, "loss/crossentropy": 1.9457404017448425, "loss/hidden": 3.0859375, "loss/jsd": 0.0, "loss/logits": 0.23729050904512405, "step": 3538 }, { "epoch": 0.22125, "grad_norm": 2.875, "grad_norm_var": 0.07551167805989584, "learning_rate": 0.0001, "loss": 7.9995, "loss/crossentropy": 2.2962979078292847, "loss/hidden": 3.09375, "loss/jsd": 0.0, "loss/logits": 0.2636758089065552, "step": 3540 }, { "epoch": 0.221375, "grad_norm": 2.890625, "grad_norm_var": 0.07558492024739584, "learning_rate": 0.0001, "loss": 7.9346, "loss/crossentropy": 2.275332808494568, "loss/hidden": 3.1484375, "loss/jsd": 0.0, "loss/logits": 0.2384483963251114, "step": 3542 }, { "epoch": 0.2215, "grad_norm": 2.875, "grad_norm_var": 0.027839152018229167, "learning_rate": 0.0001, "loss": 7.6944, "loss/crossentropy": 2.0947870016098022, "loss/hidden": 3.1484375, "loss/jsd": 0.0, "loss/logits": 0.22623080015182495, "step": 3544 }, { "epoch": 0.221625, "grad_norm": 2.859375, "grad_norm_var": 0.025633748372395834, "learning_rate": 0.0001, "loss": 7.7702, "loss/crossentropy": 2.4045172929763794, "loss/hidden": 3.1484375, "loss/jsd": 0.0, "loss/logits": 0.2514096647500992, "step": 3546 }, { "epoch": 0.22175, "grad_norm": 2.921875, "grad_norm_var": 0.0135894775390625, "learning_rate": 0.0001, "loss": 7.8777, "loss/crossentropy": 2.2690885066986084, "loss/hidden": 3.15625, "loss/jsd": 0.0, "loss/logits": 0.23157186806201935, "step": 3548 }, { "epoch": 0.221875, "grad_norm": 2.6875, "grad_norm_var": 0.02906494140625, "learning_rate": 0.0001, "loss": 8.1106, "loss/crossentropy": 2.4599485397338867, "loss/hidden": 3.203125, "loss/jsd": 0.0, "loss/logits": 0.28053322434425354, "step": 3550 }, { "epoch": 0.222, "grad_norm": 2.8125, "grad_norm_var": 0.028880818684895834, "learning_rate": 0.0001, "loss": 7.9334, "loss/crossentropy": 2.15872859954834, "loss/hidden": 3.171875, "loss/jsd": 0.0, "loss/logits": 0.24894960224628448, "step": 3552 }, { "epoch": 0.222125, "grad_norm": 2.875, "grad_norm_var": 0.024486287434895834, "learning_rate": 0.0001, "loss": 7.8626, "loss/crossentropy": 2.3765709400177, "loss/hidden": 3.1484375, "loss/jsd": 0.0, "loss/logits": 0.25589361786842346, "step": 3554 }, { "epoch": 0.22225, "grad_norm": 2.84375, "grad_norm_var": 0.0244293212890625, "learning_rate": 0.0001, "loss": 7.9958, "loss/crossentropy": 2.0382995009422302, "loss/hidden": 3.1015625, "loss/jsd": 0.0, "loss/logits": 0.23948778212070465, "step": 3556 }, { "epoch": 0.222375, "grad_norm": 2.984375, "grad_norm_var": 0.022981770833333335, "learning_rate": 0.0001, "loss": 7.9355, "loss/crossentropy": 2.2267855405807495, "loss/hidden": 3.109375, "loss/jsd": 0.0, "loss/logits": 0.23568203300237656, "step": 3558 }, { "epoch": 0.2225, "grad_norm": 3.109375, "grad_norm_var": 0.025846354166666665, "learning_rate": 0.0001, "loss": 8.0736, "loss/crossentropy": 2.304799437522888, "loss/hidden": 3.1171875, "loss/jsd": 0.0, "loss/logits": 0.23179854452610016, "step": 3560 }, { "epoch": 0.222625, "grad_norm": 2.8125, "grad_norm_var": 0.028206380208333333, "learning_rate": 0.0001, "loss": 8.1224, "loss/crossentropy": 2.3973569869995117, "loss/hidden": 3.1484375, "loss/jsd": 0.0, "loss/logits": 0.258654460310936, "step": 3562 }, { "epoch": 0.22275, "grad_norm": 2.875, "grad_norm_var": 0.027132161458333335, "learning_rate": 0.0001, "loss": 7.8143, "loss/crossentropy": 2.097848653793335, "loss/hidden": 3.109375, "loss/jsd": 0.0, "loss/logits": 0.23939958959817886, "step": 3564 }, { "epoch": 0.222875, "grad_norm": 3.234375, "grad_norm_var": 0.018822224934895833, "learning_rate": 0.0001, "loss": 7.9652, "loss/crossentropy": 2.5855683088302612, "loss/hidden": 3.171875, "loss/jsd": 0.0, "loss/logits": 0.2576940581202507, "step": 3566 }, { "epoch": 0.223, "grad_norm": 3.0, "grad_norm_var": 0.01822509765625, "learning_rate": 0.0001, "loss": 8.0738, "loss/crossentropy": 2.328821897506714, "loss/hidden": 3.1640625, "loss/jsd": 0.0, "loss/logits": 0.25657960772514343, "step": 3568 }, { "epoch": 0.223125, "grad_norm": 2.828125, "grad_norm_var": 0.022997029622395835, "learning_rate": 0.0001, "loss": 8.0868, "loss/crossentropy": 2.537997841835022, "loss/hidden": 3.25, "loss/jsd": 0.0, "loss/logits": 0.26182398200035095, "step": 3570 }, { "epoch": 0.22325, "grad_norm": 2.671875, "grad_norm_var": 0.027057902018229166, "learning_rate": 0.0001, "loss": 7.8351, "loss/crossentropy": 2.1498693227767944, "loss/hidden": 3.125, "loss/jsd": 0.0, "loss/logits": 0.25888827443122864, "step": 3572 }, { "epoch": 0.223375, "grad_norm": 3.15625, "grad_norm_var": 0.03264058430989583, "learning_rate": 0.0001, "loss": 7.9163, "loss/crossentropy": 2.3035428524017334, "loss/hidden": 3.140625, "loss/jsd": 0.0, "loss/logits": 0.2618688642978668, "step": 3574 }, { "epoch": 0.2235, "grad_norm": 3.015625, "grad_norm_var": 0.03328348795572917, "learning_rate": 0.0001, "loss": 8.1395, "loss/crossentropy": 2.457966685295105, "loss/hidden": 3.15625, "loss/jsd": 0.0, "loss/logits": 0.2607010751962662, "step": 3576 }, { "epoch": 0.223625, "grad_norm": 3.234375, "grad_norm_var": 0.04112040201822917, "learning_rate": 0.0001, "loss": 8.1893, "loss/crossentropy": 2.220232129096985, "loss/hidden": 3.15625, "loss/jsd": 0.0, "loss/logits": 0.2506686598062515, "step": 3578 }, { "epoch": 0.22375, "grad_norm": 2.765625, "grad_norm_var": 0.0400787353515625, "learning_rate": 0.0001, "loss": 8.0102, "loss/crossentropy": 2.3712148666381836, "loss/hidden": 3.1015625, "loss/jsd": 0.0, "loss/logits": 0.2428571730852127, "step": 3580 }, { "epoch": 0.223875, "grad_norm": 2.65625, "grad_norm_var": 0.04485270182291667, "learning_rate": 0.0001, "loss": 7.8827, "loss/crossentropy": 2.2805765867233276, "loss/hidden": 3.1328125, "loss/jsd": 0.0, "loss/logits": 0.23625147342681885, "step": 3582 }, { "epoch": 0.224, "grad_norm": 2.78125, "grad_norm_var": 0.048371378580729166, "learning_rate": 0.0001, "loss": 8.0278, "loss/crossentropy": 2.3268240690231323, "loss/hidden": 3.15625, "loss/jsd": 0.0, "loss/logits": 0.2626963108778, "step": 3584 }, { "epoch": 0.224125, "grad_norm": 2.984375, "grad_norm_var": 0.04468485514322917, "learning_rate": 0.0001, "loss": 7.8847, "loss/crossentropy": 2.4000160694122314, "loss/hidden": 3.140625, "loss/jsd": 0.0, "loss/logits": 0.2558213621377945, "step": 3586 }, { "epoch": 0.22425, "grad_norm": 3.109375, "grad_norm_var": 0.0423492431640625, "learning_rate": 0.0001, "loss": 7.9188, "loss/crossentropy": 2.296359062194824, "loss/hidden": 3.1484375, "loss/jsd": 0.0, "loss/logits": 0.24004730582237244, "step": 3588 }, { "epoch": 0.224375, "grad_norm": 2.984375, "grad_norm_var": 0.03669331868489583, "learning_rate": 0.0001, "loss": 7.9116, "loss/crossentropy": 2.4102399349212646, "loss/hidden": 3.1015625, "loss/jsd": 0.0, "loss/logits": 0.24954771995544434, "step": 3590 }, { "epoch": 0.2245, "grad_norm": 2.78125, "grad_norm_var": 0.035660807291666666, "learning_rate": 0.0001, "loss": 7.9391, "loss/crossentropy": 2.33876371383667, "loss/hidden": 3.1640625, "loss/jsd": 0.0, "loss/logits": 0.2509802132844925, "step": 3592 }, { "epoch": 0.224625, "grad_norm": 2.890625, "grad_norm_var": 0.0248046875, "learning_rate": 0.0001, "loss": 7.8352, "loss/crossentropy": 2.33649480342865, "loss/hidden": 3.0625, "loss/jsd": 0.0, "loss/logits": 0.2264355644583702, "step": 3594 }, { "epoch": 0.22475, "grad_norm": 3.140625, "grad_norm_var": 0.027718098958333333, "learning_rate": 0.0001, "loss": 8.0294, "loss/crossentropy": 2.255256175994873, "loss/hidden": 3.125, "loss/jsd": 0.0, "loss/logits": 0.24728095531463623, "step": 3596 }, { "epoch": 0.224875, "grad_norm": 3.0, "grad_norm_var": 0.03717447916666667, "learning_rate": 0.0001, "loss": 8.3306, "loss/crossentropy": 2.1616681814193726, "loss/hidden": 3.1484375, "loss/jsd": 0.0, "loss/logits": 0.3028740808367729, "step": 3598 }, { "epoch": 0.225, "grad_norm": 3.203125, "grad_norm_var": 0.033665974934895836, "learning_rate": 0.0001, "loss": 8.1381, "loss/crossentropy": 2.1879382133483887, "loss/hidden": 3.1796875, "loss/jsd": 0.0, "loss/logits": 0.24906174838542938, "step": 3600 }, { "epoch": 0.225125, "grad_norm": 2.71875, "grad_norm_var": 0.0383209228515625, "learning_rate": 0.0001, "loss": 8.2209, "loss/crossentropy": 2.551230788230896, "loss/hidden": 3.1328125, "loss/jsd": 0.0, "loss/logits": 0.26665643602609634, "step": 3602 }, { "epoch": 0.22525, "grad_norm": 3.359375, "grad_norm_var": 0.04551493326822917, "learning_rate": 0.0001, "loss": 8.3031, "loss/crossentropy": 2.5071710348129272, "loss/hidden": 3.3125, "loss/jsd": 0.0, "loss/logits": 0.342557817697525, "step": 3604 }, { "epoch": 0.225375, "grad_norm": 2.90625, "grad_norm_var": 0.047098795572916664, "learning_rate": 0.0001, "loss": 8.1181, "loss/crossentropy": 2.30819833278656, "loss/hidden": 3.171875, "loss/jsd": 0.0, "loss/logits": 0.26556289196014404, "step": 3606 }, { "epoch": 0.2255, "grad_norm": 2.890625, "grad_norm_var": 0.0525054931640625, "learning_rate": 0.0001, "loss": 7.7835, "loss/crossentropy": 2.093214511871338, "loss/hidden": 3.125, "loss/jsd": 0.0, "loss/logits": 0.2401185780763626, "step": 3608 }, { "epoch": 0.225625, "grad_norm": 3.015625, "grad_norm_var": 0.047265625, "learning_rate": 0.0001, "loss": 8.0272, "loss/crossentropy": 2.3199344873428345, "loss/hidden": 3.1875, "loss/jsd": 0.0, "loss/logits": 0.2687990739941597, "step": 3610 }, { "epoch": 0.22575, "grad_norm": 3.21875, "grad_norm_var": 0.05095113118489583, "learning_rate": 0.0001, "loss": 8.0956, "loss/crossentropy": 2.178806185722351, "loss/hidden": 3.09375, "loss/jsd": 0.0, "loss/logits": 0.22464393079280853, "step": 3612 }, { "epoch": 0.225875, "grad_norm": 2.9375, "grad_norm_var": 0.0380035400390625, "learning_rate": 0.0001, "loss": 8.2848, "loss/crossentropy": 2.8881205320358276, "loss/hidden": 3.34375, "loss/jsd": 0.0, "loss/logits": 0.2964523136615753, "step": 3614 }, { "epoch": 0.226, "grad_norm": 3.5, "grad_norm_var": 0.0517486572265625, "learning_rate": 0.0001, "loss": 8.1152, "loss/crossentropy": 2.3062459230422974, "loss/hidden": 3.140625, "loss/jsd": 0.0, "loss/logits": 0.2544550150632858, "step": 3616 }, { "epoch": 0.226125, "grad_norm": 2.765625, "grad_norm_var": 0.050812784830729166, "learning_rate": 0.0001, "loss": 7.9502, "loss/crossentropy": 2.1567277312278748, "loss/hidden": 3.140625, "loss/jsd": 0.0, "loss/logits": 0.25140413641929626, "step": 3618 }, { "epoch": 0.22625, "grad_norm": 2.921875, "grad_norm_var": 0.04439697265625, "learning_rate": 0.0001, "loss": 7.8556, "loss/crossentropy": 2.313141703605652, "loss/hidden": 3.140625, "loss/jsd": 0.0, "loss/logits": 0.24740488827228546, "step": 3620 }, { "epoch": 0.226375, "grad_norm": 2.890625, "grad_norm_var": 0.04332275390625, "learning_rate": 0.0001, "loss": 8.0423, "loss/crossentropy": 2.328591823577881, "loss/hidden": 3.1484375, "loss/jsd": 0.0, "loss/logits": 0.26608438789844513, "step": 3622 }, { "epoch": 0.2265, "grad_norm": 3.125, "grad_norm_var": 0.0403472900390625, "learning_rate": 0.0001, "loss": 8.0185, "loss/crossentropy": 2.4062294960021973, "loss/hidden": 3.171875, "loss/jsd": 0.0, "loss/logits": 0.25196973979473114, "step": 3624 }, { "epoch": 0.226625, "grad_norm": 3.046875, "grad_norm_var": 0.03852437337239583, "learning_rate": 0.0001, "loss": 8.2194, "loss/crossentropy": 2.3326817750930786, "loss/hidden": 3.140625, "loss/jsd": 0.0, "loss/logits": 0.25500936061143875, "step": 3626 }, { "epoch": 0.22675, "grad_norm": 2.84375, "grad_norm_var": 0.039876302083333336, "learning_rate": 0.0001, "loss": 8.0569, "loss/crossentropy": 2.482856869697571, "loss/hidden": 3.140625, "loss/jsd": 0.0, "loss/logits": 0.25555209815502167, "step": 3628 }, { "epoch": 0.226875, "grad_norm": 3.53125, "grad_norm_var": 0.10530598958333333, "learning_rate": 0.0001, "loss": 8.2217, "loss/crossentropy": 2.392806649208069, "loss/hidden": 3.171875, "loss/jsd": 0.0, "loss/logits": 0.2713817358016968, "step": 3630 }, { "epoch": 0.227, "grad_norm": 2.96875, "grad_norm_var": 0.0933746337890625, "learning_rate": 0.0001, "loss": 8.2214, "loss/crossentropy": 2.4297484159469604, "loss/hidden": 3.1328125, "loss/jsd": 0.0, "loss/logits": 0.26915179938077927, "step": 3632 }, { "epoch": 0.227125, "grad_norm": 2.96875, "grad_norm_var": 0.09046223958333334, "learning_rate": 0.0001, "loss": 7.892, "loss/crossentropy": 2.0668236017227173, "loss/hidden": 3.171875, "loss/jsd": 0.0, "loss/logits": 0.24264214932918549, "step": 3634 }, { "epoch": 0.22725, "grad_norm": 2.8125, "grad_norm_var": 0.0952056884765625, "learning_rate": 0.0001, "loss": 7.6419, "loss/crossentropy": 2.2886130809783936, "loss/hidden": 3.15625, "loss/jsd": 0.0, "loss/logits": 0.2411990761756897, "step": 3636 }, { "epoch": 0.227375, "grad_norm": 2.765625, "grad_norm_var": 0.1007476806640625, "learning_rate": 0.0001, "loss": 7.8926, "loss/crossentropy": 2.4050437211990356, "loss/hidden": 3.125, "loss/jsd": 0.0, "loss/logits": 0.248046413064003, "step": 3638 }, { "epoch": 0.2275, "grad_norm": 3.015625, "grad_norm_var": 0.0963775634765625, "learning_rate": 0.0001, "loss": 7.9465, "loss/crossentropy": 2.237168073654175, "loss/hidden": 3.171875, "loss/jsd": 0.0, "loss/logits": 0.2606538310647011, "step": 3640 }, { "epoch": 0.227625, "grad_norm": 2.96875, "grad_norm_var": 0.09270426432291666, "learning_rate": 0.0001, "loss": 8.0894, "loss/crossentropy": 2.1759636998176575, "loss/hidden": 3.125, "loss/jsd": 0.0, "loss/logits": 0.25069190561771393, "step": 3642 }, { "epoch": 0.22775, "grad_norm": 2.75, "grad_norm_var": 0.09306640625, "learning_rate": 0.0001, "loss": 7.8278, "loss/crossentropy": 2.33109974861145, "loss/hidden": 3.125, "loss/jsd": 0.0, "loss/logits": 0.24534446746110916, "step": 3644 }, { "epoch": 0.227875, "grad_norm": 2.8125, "grad_norm_var": 0.010319010416666666, "learning_rate": 0.0001, "loss": 7.9532, "loss/crossentropy": 2.209414005279541, "loss/hidden": 3.203125, "loss/jsd": 0.0, "loss/logits": 0.23883101344108582, "step": 3646 }, { "epoch": 0.228, "grad_norm": 2.9375, "grad_norm_var": 0.009993489583333333, "learning_rate": 0.0001, "loss": 8.1503, "loss/crossentropy": 2.3681305646896362, "loss/hidden": 3.140625, "loss/jsd": 0.0, "loss/logits": 0.2580679953098297, "step": 3648 }, { "epoch": 0.228125, "grad_norm": 2.890625, "grad_norm_var": 0.011913045247395834, "learning_rate": 0.0001, "loss": 7.9858, "loss/crossentropy": 2.2001166343688965, "loss/hidden": 3.0859375, "loss/jsd": 0.0, "loss/logits": 0.2430475950241089, "step": 3650 }, { "epoch": 0.22825, "grad_norm": 3.0, "grad_norm_var": 0.011800130208333334, "learning_rate": 0.0001, "loss": 7.9813, "loss/crossentropy": 2.2274385690689087, "loss/hidden": 3.15625, "loss/jsd": 0.0, "loss/logits": 0.23911988735198975, "step": 3652 }, { "epoch": 0.228375, "grad_norm": 2.84375, "grad_norm_var": 0.010380045572916666, "learning_rate": 0.0001, "loss": 8.0302, "loss/crossentropy": 2.230368971824646, "loss/hidden": 3.125, "loss/jsd": 0.0, "loss/logits": 0.24802518635988235, "step": 3654 }, { "epoch": 0.2285, "grad_norm": 2.6875, "grad_norm_var": 0.012105305989583334, "learning_rate": 0.0001, "loss": 7.7304, "loss/crossentropy": 2.195298194885254, "loss/hidden": 3.1484375, "loss/jsd": 0.0, "loss/logits": 0.23306956887245178, "step": 3656 }, { "epoch": 0.228625, "grad_norm": 2.90625, "grad_norm_var": 0.014061482747395833, "learning_rate": 0.0001, "loss": 8.0813, "loss/crossentropy": 2.1692891120910645, "loss/hidden": 3.171875, "loss/jsd": 0.0, "loss/logits": 0.24116355180740356, "step": 3658 }, { "epoch": 0.22875, "grad_norm": 3.125, "grad_norm_var": 0.017292277018229166, "learning_rate": 0.0001, "loss": 8.1278, "loss/crossentropy": 2.3059465885162354, "loss/hidden": 3.1328125, "loss/jsd": 0.0, "loss/logits": 0.24064010381698608, "step": 3660 }, { "epoch": 0.228875, "grad_norm": 2.90625, "grad_norm_var": 0.018192545572916666, "learning_rate": 0.0001, "loss": 7.9778, "loss/crossentropy": 2.399103045463562, "loss/hidden": 3.125, "loss/jsd": 0.0, "loss/logits": 0.23807506263256073, "step": 3662 }, { "epoch": 0.229, "grad_norm": 2.828125, "grad_norm_var": 0.01832275390625, "learning_rate": 0.0001, "loss": 8.1474, "loss/crossentropy": 2.3898919820785522, "loss/hidden": 3.15625, "loss/jsd": 0.0, "loss/logits": 0.25592923909425735, "step": 3664 }, { "epoch": 0.229125, "grad_norm": 2.71875, "grad_norm_var": 0.0172515869140625, "learning_rate": 0.0001, "loss": 7.8636, "loss/crossentropy": 2.329254150390625, "loss/hidden": 3.171875, "loss/jsd": 0.0, "loss/logits": 0.2536974400281906, "step": 3666 }, { "epoch": 0.22925, "grad_norm": 2.65625, "grad_norm_var": 0.018648274739583335, "learning_rate": 0.0001, "loss": 7.8664, "loss/crossentropy": 2.1764838695526123, "loss/hidden": 3.078125, "loss/jsd": 0.0, "loss/logits": 0.2520892173051834, "step": 3668 }, { "epoch": 0.229375, "grad_norm": 2.78125, "grad_norm_var": 0.027025349934895835, "learning_rate": 0.0001, "loss": 7.9033, "loss/crossentropy": 2.326913833618164, "loss/hidden": 3.15625, "loss/jsd": 0.0, "loss/logits": 0.22852397710084915, "step": 3670 }, { "epoch": 0.2295, "grad_norm": 2.8125, "grad_norm_var": 0.025251261393229165, "learning_rate": 0.0001, "loss": 8.1325, "loss/crossentropy": 2.5875282287597656, "loss/hidden": 3.1953125, "loss/jsd": 0.0, "loss/logits": 0.25728514790534973, "step": 3672 }, { "epoch": 0.229625, "grad_norm": 3.125, "grad_norm_var": 0.025536092122395833, "learning_rate": 0.0001, "loss": 8.1942, "loss/crossentropy": 2.4060288667678833, "loss/hidden": 3.1328125, "loss/jsd": 0.0, "loss/logits": 0.2609986811876297, "step": 3674 }, { "epoch": 0.22975, "grad_norm": 2.859375, "grad_norm_var": 0.021361287434895834, "learning_rate": 0.0001, "loss": 7.9882, "loss/crossentropy": 2.400329113006592, "loss/hidden": 3.109375, "loss/jsd": 0.0, "loss/logits": 0.2339789718389511, "step": 3676 }, { "epoch": 0.229875, "grad_norm": 2.828125, "grad_norm_var": 0.021223958333333334, "learning_rate": 0.0001, "loss": 7.7641, "loss/crossentropy": 2.1139498949050903, "loss/hidden": 3.0390625, "loss/jsd": 0.0, "loss/logits": 0.24976061284542084, "step": 3678 }, { "epoch": 0.23, "grad_norm": 2.875, "grad_norm_var": 0.021142578125, "learning_rate": 0.0001, "loss": 8.0759, "loss/crossentropy": 2.3943647146224976, "loss/hidden": 3.1640625, "loss/jsd": 0.0, "loss/logits": 0.2582573890686035, "step": 3680 }, { "epoch": 0.230125, "grad_norm": 3.046875, "grad_norm_var": 0.02203369140625, "learning_rate": 0.0001, "loss": 8.1859, "loss/crossentropy": 2.6016229391098022, "loss/hidden": 3.28125, "loss/jsd": 0.0, "loss/logits": 0.2739368677139282, "step": 3682 }, { "epoch": 0.23025, "grad_norm": 2.6875, "grad_norm_var": 0.020796712239583334, "learning_rate": 0.0001, "loss": 7.8159, "loss/crossentropy": 2.18959903717041, "loss/hidden": 3.09375, "loss/jsd": 0.0, "loss/logits": 0.24328485131263733, "step": 3684 }, { "epoch": 0.230375, "grad_norm": 3.109375, "grad_norm_var": 0.017704264322916666, "learning_rate": 0.0001, "loss": 8.11, "loss/crossentropy": 2.5320764780044556, "loss/hidden": 3.125, "loss/jsd": 0.0, "loss/logits": 0.24227013438940048, "step": 3686 }, { "epoch": 0.2305, "grad_norm": 2.734375, "grad_norm_var": 0.018512980143229166, "learning_rate": 0.0001, "loss": 7.8237, "loss/crossentropy": 2.3462241888046265, "loss/hidden": 3.109375, "loss/jsd": 0.0, "loss/logits": 0.25317418575286865, "step": 3688 }, { "epoch": 0.230625, "grad_norm": 2.8125, "grad_norm_var": 0.016243489583333333, "learning_rate": 0.0001, "loss": 7.8874, "loss/crossentropy": 2.1772103309631348, "loss/hidden": 3.140625, "loss/jsd": 0.0, "loss/logits": 0.2475140392780304, "step": 3690 }, { "epoch": 0.23075, "grad_norm": 2.703125, "grad_norm_var": 0.0204254150390625, "learning_rate": 0.0001, "loss": 8.0884, "loss/crossentropy": 2.16398286819458, "loss/hidden": 3.1484375, "loss/jsd": 0.0, "loss/logits": 0.26599203050136566, "step": 3692 }, { "epoch": 0.230875, "grad_norm": 2.90625, "grad_norm_var": 0.018648274739583335, "learning_rate": 0.0001, "loss": 7.8644, "loss/crossentropy": 2.2632850408554077, "loss/hidden": 3.09375, "loss/jsd": 0.0, "loss/logits": 0.24153126776218414, "step": 3694 }, { "epoch": 0.231, "grad_norm": 2.921875, "grad_norm_var": 0.019066365559895833, "learning_rate": 0.0001, "loss": 8.0305, "loss/crossentropy": 2.3470261096954346, "loss/hidden": 3.0859375, "loss/jsd": 0.0, "loss/logits": 0.24013527482748032, "step": 3696 }, { "epoch": 0.231125, "grad_norm": 3.015625, "grad_norm_var": 0.018748982747395834, "learning_rate": 0.0001, "loss": 7.8235, "loss/crossentropy": 2.21561336517334, "loss/hidden": 3.109375, "loss/jsd": 0.0, "loss/logits": 0.2327084168791771, "step": 3698 }, { "epoch": 0.23125, "grad_norm": 2.78125, "grad_norm_var": 0.0171783447265625, "learning_rate": 0.0001, "loss": 7.9991, "loss/crossentropy": 2.2310367822647095, "loss/hidden": 3.1484375, "loss/jsd": 0.0, "loss/logits": 0.242506742477417, "step": 3700 }, { "epoch": 0.231375, "grad_norm": 3.078125, "grad_norm_var": 0.016852823893229167, "learning_rate": 0.0001, "loss": 8.1388, "loss/crossentropy": 2.515184164047241, "loss/hidden": 3.1875, "loss/jsd": 0.0, "loss/logits": 0.2872355580329895, "step": 3702 }, { "epoch": 0.2315, "grad_norm": 2.640625, "grad_norm_var": 0.019782511393229167, "learning_rate": 0.0001, "loss": 7.9631, "loss/crossentropy": 2.2909129858016968, "loss/hidden": 3.0546875, "loss/jsd": 0.0, "loss/logits": 0.23802363872528076, "step": 3704 }, { "epoch": 0.231625, "grad_norm": 2.875, "grad_norm_var": 0.017671712239583335, "learning_rate": 0.0001, "loss": 7.8775, "loss/crossentropy": 2.387066602706909, "loss/hidden": 3.1015625, "loss/jsd": 0.0, "loss/logits": 0.24512099474668503, "step": 3706 }, { "epoch": 0.23175, "grad_norm": 2.765625, "grad_norm_var": 0.014404296875, "learning_rate": 0.0001, "loss": 7.8625, "loss/crossentropy": 2.254941940307617, "loss/hidden": 3.1796875, "loss/jsd": 0.0, "loss/logits": 0.2513599842786789, "step": 3708 }, { "epoch": 0.231875, "grad_norm": 2.828125, "grad_norm_var": 0.013765462239583333, "learning_rate": 0.0001, "loss": 7.8971, "loss/crossentropy": 2.142080545425415, "loss/hidden": 3.0234375, "loss/jsd": 0.0, "loss/logits": 0.2329133376479149, "step": 3710 }, { "epoch": 0.232, "grad_norm": 2.734375, "grad_norm_var": 0.017878214518229168, "learning_rate": 0.0001, "loss": 7.7952, "loss/crossentropy": 2.3408135175704956, "loss/hidden": 3.09375, "loss/jsd": 0.0, "loss/logits": 0.245710588991642, "step": 3712 }, { "epoch": 0.232125, "grad_norm": 2.75, "grad_norm_var": 0.016499837239583332, "learning_rate": 0.0001, "loss": 7.9523, "loss/crossentropy": 2.4113335609436035, "loss/hidden": 3.1328125, "loss/jsd": 0.0, "loss/logits": 0.23740407824516296, "step": 3714 }, { "epoch": 0.23225, "grad_norm": 2.9375, "grad_norm_var": 0.020637003580729167, "learning_rate": 0.0001, "loss": 8.2332, "loss/crossentropy": 2.397470474243164, "loss/hidden": 3.1640625, "loss/jsd": 0.0, "loss/logits": 0.3087628036737442, "step": 3716 }, { "epoch": 0.232375, "grad_norm": 2.734375, "grad_norm_var": 0.017769368489583333, "learning_rate": 0.0001, "loss": 8.0103, "loss/crossentropy": 2.3515301942825317, "loss/hidden": 3.171875, "loss/jsd": 0.0, "loss/logits": 0.24831266701221466, "step": 3718 }, { "epoch": 0.2325, "grad_norm": 2.9375, "grad_norm_var": 0.014957682291666666, "learning_rate": 0.0001, "loss": 7.916, "loss/crossentropy": 2.222606897354126, "loss/hidden": 3.046875, "loss/jsd": 0.0, "loss/logits": 0.24274057149887085, "step": 3720 }, { "epoch": 0.232625, "grad_norm": 3.125, "grad_norm_var": 0.019331868489583334, "learning_rate": 0.0001, "loss": 8.1821, "loss/crossentropy": 2.3737378120422363, "loss/hidden": 3.125, "loss/jsd": 0.0, "loss/logits": 0.24392034113407135, "step": 3722 }, { "epoch": 0.23275, "grad_norm": 3.140625, "grad_norm_var": 0.026953125, "learning_rate": 0.0001, "loss": 8.0112, "loss/crossentropy": 2.5040173530578613, "loss/hidden": 3.1953125, "loss/jsd": 0.0, "loss/logits": 0.26996733248233795, "step": 3724 }, { "epoch": 0.232875, "grad_norm": 2.71875, "grad_norm_var": 0.028645833333333332, "learning_rate": 0.0001, "loss": 7.9624, "loss/crossentropy": 2.274856448173523, "loss/hidden": 3.140625, "loss/jsd": 0.0, "loss/logits": 0.26013311743736267, "step": 3726 }, { "epoch": 0.233, "grad_norm": 2.734375, "grad_norm_var": 0.028172810872395832, "learning_rate": 0.0001, "loss": 7.9561, "loss/crossentropy": 2.2882769107818604, "loss/hidden": 3.0625, "loss/jsd": 0.0, "loss/logits": 0.23719671368598938, "step": 3728 }, { "epoch": 0.233125, "grad_norm": 2.859375, "grad_norm_var": 0.0279449462890625, "learning_rate": 0.0001, "loss": 7.9788, "loss/crossentropy": 2.222030520439148, "loss/hidden": 3.15625, "loss/jsd": 0.0, "loss/logits": 0.2667320519685745, "step": 3730 }, { "epoch": 0.23325, "grad_norm": 2.890625, "grad_norm_var": 0.0249664306640625, "learning_rate": 0.0001, "loss": 7.8122, "loss/crossentropy": 2.127245843410492, "loss/hidden": 3.109375, "loss/jsd": 0.0, "loss/logits": 0.2510572522878647, "step": 3732 }, { "epoch": 0.233375, "grad_norm": 2.84375, "grad_norm_var": 0.025081380208333334, "learning_rate": 0.0001, "loss": 7.8065, "loss/crossentropy": 2.412429094314575, "loss/hidden": 3.109375, "loss/jsd": 0.0, "loss/logits": 0.2435067892074585, "step": 3734 }, { "epoch": 0.2335, "grad_norm": 2.90625, "grad_norm_var": 0.025406901041666666, "learning_rate": 0.0001, "loss": 7.9871, "loss/crossentropy": 2.1098079085350037, "loss/hidden": 3.1875, "loss/jsd": 0.0, "loss/logits": 0.28457003831863403, "step": 3736 }, { "epoch": 0.233625, "grad_norm": 3.09375, "grad_norm_var": 0.027164713541666666, "learning_rate": 0.0001, "loss": 7.9466, "loss/crossentropy": 2.1928519010543823, "loss/hidden": 3.125, "loss/jsd": 0.0, "loss/logits": 0.25453677773475647, "step": 3738 }, { "epoch": 0.23375, "grad_norm": 2.75, "grad_norm_var": 0.0246978759765625, "learning_rate": 0.0001, "loss": 8.0324, "loss/crossentropy": 2.5346169471740723, "loss/hidden": 3.125, "loss/jsd": 0.0, "loss/logits": 0.2512079030275345, "step": 3740 }, { "epoch": 0.233875, "grad_norm": 2.75, "grad_norm_var": 0.025716145833333332, "learning_rate": 0.0001, "loss": 8.0987, "loss/crossentropy": 2.3410524129867554, "loss/hidden": 3.1328125, "loss/jsd": 0.0, "loss/logits": 0.2596224248409271, "step": 3742 }, { "epoch": 0.234, "grad_norm": 2.859375, "grad_norm_var": 0.022261555989583334, "learning_rate": 0.0001, "loss": 7.7984, "loss/crossentropy": 2.353347420692444, "loss/hidden": 3.1484375, "loss/jsd": 0.0, "loss/logits": 0.26650838553905487, "step": 3744 }, { "epoch": 0.234125, "grad_norm": 2.90625, "grad_norm_var": 0.025519816080729167, "learning_rate": 0.0001, "loss": 8.1373, "loss/crossentropy": 2.4095019102096558, "loss/hidden": 3.1875, "loss/jsd": 0.0, "loss/logits": 0.26190927624702454, "step": 3746 }, { "epoch": 0.23425, "grad_norm": 2.734375, "grad_norm_var": 0.028937784830729167, "learning_rate": 0.0001, "loss": 7.6916, "loss/crossentropy": 2.0359702110290527, "loss/hidden": 3.0, "loss/jsd": 0.0, "loss/logits": 0.2307998687028885, "step": 3748 }, { "epoch": 0.234375, "grad_norm": 3.015625, "grad_norm_var": 0.028999837239583333, "learning_rate": 0.0001, "loss": 7.792, "loss/crossentropy": 2.168085813522339, "loss/hidden": 3.1171875, "loss/jsd": 0.0, "loss/logits": 0.23495958745479584, "step": 3750 }, { "epoch": 0.2345, "grad_norm": 3.265625, "grad_norm_var": 0.04117431640625, "learning_rate": 0.0001, "loss": 7.9602, "loss/crossentropy": 2.026396870613098, "loss/hidden": 3.0859375, "loss/jsd": 0.0, "loss/logits": 0.2430543154478073, "step": 3752 }, { "epoch": 0.234625, "grad_norm": 3.015625, "grad_norm_var": 0.037694295247395836, "learning_rate": 0.0001, "loss": 7.9993, "loss/crossentropy": 2.4510369300842285, "loss/hidden": 3.1796875, "loss/jsd": 0.0, "loss/logits": 0.27556227147579193, "step": 3754 }, { "epoch": 0.23475, "grad_norm": 2.875, "grad_norm_var": 0.03394266764322917, "learning_rate": 0.0001, "loss": 7.9829, "loss/crossentropy": 2.0846810340881348, "loss/hidden": 3.015625, "loss/jsd": 0.0, "loss/logits": 0.235876202583313, "step": 3756 }, { "epoch": 0.234875, "grad_norm": 2.84375, "grad_norm_var": 0.031266276041666666, "learning_rate": 0.0001, "loss": 7.8668, "loss/crossentropy": 2.279644012451172, "loss/hidden": 3.1015625, "loss/jsd": 0.0, "loss/logits": 0.250064454972744, "step": 3758 }, { "epoch": 0.235, "grad_norm": 2.765625, "grad_norm_var": 0.0311431884765625, "learning_rate": 0.0001, "loss": 7.9483, "loss/crossentropy": 2.1563061475753784, "loss/hidden": 3.1328125, "loss/jsd": 0.0, "loss/logits": 0.258032388985157, "step": 3760 }, { "epoch": 0.235125, "grad_norm": 2.65625, "grad_norm_var": 0.031538899739583334, "learning_rate": 0.0001, "loss": 7.8565, "loss/crossentropy": 2.071030616760254, "loss/hidden": 3.140625, "loss/jsd": 0.0, "loss/logits": 0.23593812435865402, "step": 3762 }, { "epoch": 0.23525, "grad_norm": 3.0, "grad_norm_var": 0.027887980143229168, "learning_rate": 0.0001, "loss": 8.0189, "loss/crossentropy": 2.3263286352157593, "loss/hidden": 3.140625, "loss/jsd": 0.0, "loss/logits": 0.25163403153419495, "step": 3764 }, { "epoch": 0.235375, "grad_norm": 2.75, "grad_norm_var": 0.02740478515625, "learning_rate": 0.0001, "loss": 7.7014, "loss/crossentropy": 2.456760048866272, "loss/hidden": 3.078125, "loss/jsd": 0.0, "loss/logits": 0.23420259356498718, "step": 3766 }, { "epoch": 0.2355, "grad_norm": 2.953125, "grad_norm_var": 0.012044270833333334, "learning_rate": 0.0001, "loss": 8.0371, "loss/crossentropy": 2.369943618774414, "loss/hidden": 3.1796875, "loss/jsd": 0.0, "loss/logits": 0.2603626102209091, "step": 3768 }, { "epoch": 0.235625, "grad_norm": 2.9375, "grad_norm_var": 0.010106404622395834, "learning_rate": 0.0001, "loss": 8.1286, "loss/crossentropy": 2.2771997451782227, "loss/hidden": 3.140625, "loss/jsd": 0.0, "loss/logits": 0.2527083158493042, "step": 3770 }, { "epoch": 0.23575, "grad_norm": 2.84375, "grad_norm_var": 0.0128326416015625, "learning_rate": 0.0001, "loss": 8.0419, "loss/crossentropy": 2.3018823862075806, "loss/hidden": 3.1640625, "loss/jsd": 0.0, "loss/logits": 0.2760903537273407, "step": 3772 }, { "epoch": 0.235875, "grad_norm": 2.6875, "grad_norm_var": 0.0157135009765625, "learning_rate": 0.0001, "loss": 7.7634, "loss/crossentropy": 2.147810459136963, "loss/hidden": 3.09375, "loss/jsd": 0.0, "loss/logits": 0.2414494976401329, "step": 3774 }, { "epoch": 0.236, "grad_norm": 2.890625, "grad_norm_var": 0.024267578125, "learning_rate": 0.0001, "loss": 8.0522, "loss/crossentropy": 2.5797489881515503, "loss/hidden": 3.140625, "loss/jsd": 0.0, "loss/logits": 0.25508859008550644, "step": 3776 }, { "epoch": 0.236125, "grad_norm": 2.890625, "grad_norm_var": 0.02109375, "learning_rate": 0.0001, "loss": 8.0457, "loss/crossentropy": 2.334934711456299, "loss/hidden": 3.078125, "loss/jsd": 0.0, "loss/logits": 0.24477149546146393, "step": 3778 }, { "epoch": 0.23625, "grad_norm": 2.78125, "grad_norm_var": 0.0207672119140625, "learning_rate": 0.0001, "loss": 7.7548, "loss/crossentropy": 2.496270179748535, "loss/hidden": 3.2265625, "loss/jsd": 0.0, "loss/logits": 0.25811365991830826, "step": 3780 }, { "epoch": 0.236375, "grad_norm": 2.96875, "grad_norm_var": 0.019074503580729166, "learning_rate": 0.0001, "loss": 8.0027, "loss/crossentropy": 2.565447449684143, "loss/hidden": 3.15625, "loss/jsd": 0.0, "loss/logits": 0.2610893249511719, "step": 3782 }, { "epoch": 0.2365, "grad_norm": 2.84375, "grad_norm_var": 0.022044881184895834, "learning_rate": 0.0001, "loss": 8.0034, "loss/crossentropy": 2.288913130760193, "loss/hidden": 3.0703125, "loss/jsd": 0.0, "loss/logits": 0.24560889601707458, "step": 3784 }, { "epoch": 0.236625, "grad_norm": 2.921875, "grad_norm_var": 0.0223297119140625, "learning_rate": 0.0001, "loss": 7.9241, "loss/crossentropy": 2.4364962577819824, "loss/hidden": 3.1328125, "loss/jsd": 0.0, "loss/logits": 0.26674768328666687, "step": 3786 }, { "epoch": 0.23675, "grad_norm": 2.875, "grad_norm_var": 0.018504842122395834, "learning_rate": 0.0001, "loss": 8.1114, "loss/crossentropy": 2.3721729516983032, "loss/hidden": 3.1796875, "loss/jsd": 0.0, "loss/logits": 0.2508590742945671, "step": 3788 }, { "epoch": 0.236875, "grad_norm": 2.90625, "grad_norm_var": 0.014989217122395834, "learning_rate": 0.0001, "loss": 8.1629, "loss/crossentropy": 2.441157817840576, "loss/hidden": 3.09375, "loss/jsd": 0.0, "loss/logits": 0.2336357906460762, "step": 3790 }, { "epoch": 0.237, "grad_norm": 2.71875, "grad_norm_var": 0.0092437744140625, "learning_rate": 0.0001, "loss": 8.0601, "loss/crossentropy": 2.3760870695114136, "loss/hidden": 3.1875, "loss/jsd": 0.0, "loss/logits": 0.24517250061035156, "step": 3792 }, { "epoch": 0.237125, "grad_norm": 2.78125, "grad_norm_var": 0.009635416666666667, "learning_rate": 0.0001, "loss": 7.7626, "loss/crossentropy": 2.254652261734009, "loss/hidden": 3.109375, "loss/jsd": 0.0, "loss/logits": 0.23266702890396118, "step": 3794 }, { "epoch": 0.23725, "grad_norm": 2.953125, "grad_norm_var": 0.010985310872395833, "learning_rate": 0.0001, "loss": 7.9964, "loss/crossentropy": 2.3335236310958862, "loss/hidden": 3.125, "loss/jsd": 0.0, "loss/logits": 0.24718395620584488, "step": 3796 }, { "epoch": 0.237375, "grad_norm": 2.8125, "grad_norm_var": 0.010505167643229167, "learning_rate": 0.0001, "loss": 7.9836, "loss/crossentropy": 2.146742820739746, "loss/hidden": 3.1171875, "loss/jsd": 0.0, "loss/logits": 0.23980651795864105, "step": 3798 }, { "epoch": 0.2375, "grad_norm": 2.75, "grad_norm_var": 0.0105133056640625, "learning_rate": 0.0001, "loss": 8.0158, "loss/crossentropy": 2.413538336753845, "loss/hidden": 3.0859375, "loss/jsd": 0.0, "loss/logits": 0.2546129822731018, "step": 3800 }, { "epoch": 0.237625, "grad_norm": 2.78125, "grad_norm_var": 0.010221354166666667, "learning_rate": 0.0001, "loss": 7.8914, "loss/crossentropy": 2.3957451581954956, "loss/hidden": 3.1953125, "loss/jsd": 0.0, "loss/logits": 0.2576509118080139, "step": 3802 }, { "epoch": 0.23775, "grad_norm": 3.328125, "grad_norm_var": 0.0312652587890625, "learning_rate": 0.0001, "loss": 7.8871, "loss/crossentropy": 2.288939356803894, "loss/hidden": 3.1875, "loss/jsd": 0.0, "loss/logits": 0.2728182524442673, "step": 3804 }, { "epoch": 0.237875, "grad_norm": 2.828125, "grad_norm_var": 0.029573567708333335, "learning_rate": 0.0001, "loss": 7.9958, "loss/crossentropy": 2.3543388843536377, "loss/hidden": 3.203125, "loss/jsd": 0.0, "loss/logits": 0.25791068375110626, "step": 3806 }, { "epoch": 0.238, "grad_norm": 3.078125, "grad_norm_var": 0.0307769775390625, "learning_rate": 0.0001, "loss": 8.0146, "loss/crossentropy": 2.452141761779785, "loss/hidden": 3.15625, "loss/jsd": 0.0, "loss/logits": 0.23337870091199875, "step": 3808 }, { "epoch": 0.238125, "grad_norm": 2.65625, "grad_norm_var": 0.03359375, "learning_rate": 0.0001, "loss": 7.8997, "loss/crossentropy": 2.3330483436584473, "loss/hidden": 3.078125, "loss/jsd": 0.0, "loss/logits": 0.25078994035720825, "step": 3810 }, { "epoch": 0.23825, "grad_norm": 2.734375, "grad_norm_var": 0.03251851399739583, "learning_rate": 0.0001, "loss": 7.8959, "loss/crossentropy": 2.0975863933563232, "loss/hidden": 3.078125, "loss/jsd": 0.0, "loss/logits": 0.2229529693722725, "step": 3812 }, { "epoch": 0.238375, "grad_norm": 2.765625, "grad_norm_var": 0.03323567708333333, "learning_rate": 0.0001, "loss": 7.816, "loss/crossentropy": 2.0993716716766357, "loss/hidden": 3.2109375, "loss/jsd": 0.0, "loss/logits": 0.24615749716758728, "step": 3814 }, { "epoch": 0.2385, "grad_norm": 3.171875, "grad_norm_var": 0.037984212239583336, "learning_rate": 0.0001, "loss": 8.0614, "loss/crossentropy": 2.336126208305359, "loss/hidden": 3.1015625, "loss/jsd": 0.0, "loss/logits": 0.24437396228313446, "step": 3816 }, { "epoch": 0.238625, "grad_norm": 2.671875, "grad_norm_var": 0.039948527018229166, "learning_rate": 0.0001, "loss": 7.9033, "loss/crossentropy": 2.3778291940689087, "loss/hidden": 3.15625, "loss/jsd": 0.0, "loss/logits": 0.2515903264284134, "step": 3818 }, { "epoch": 0.23875, "grad_norm": 2.921875, "grad_norm_var": 0.019136555989583335, "learning_rate": 0.0001, "loss": 7.8767, "loss/crossentropy": 2.2030457258224487, "loss/hidden": 3.1015625, "loss/jsd": 0.0, "loss/logits": 0.2293495386838913, "step": 3820 }, { "epoch": 0.238875, "grad_norm": 2.859375, "grad_norm_var": 0.02066650390625, "learning_rate": 0.0001, "loss": 7.8254, "loss/crossentropy": 2.393824577331543, "loss/hidden": 3.1171875, "loss/jsd": 0.0, "loss/logits": 0.23858655989170074, "step": 3822 }, { "epoch": 0.239, "grad_norm": 2.640625, "grad_norm_var": 0.019579060872395835, "learning_rate": 0.0001, "loss": 7.7794, "loss/crossentropy": 2.202533006668091, "loss/hidden": 3.1328125, "loss/jsd": 0.0, "loss/logits": 0.2381155639886856, "step": 3824 }, { "epoch": 0.239125, "grad_norm": 2.71875, "grad_norm_var": 0.017513020833333334, "learning_rate": 0.0001, "loss": 7.8826, "loss/crossentropy": 2.2683571577072144, "loss/hidden": 3.0390625, "loss/jsd": 0.0, "loss/logits": 0.24590277671813965, "step": 3826 }, { "epoch": 0.23925, "grad_norm": 3.046875, "grad_norm_var": 0.0197662353515625, "learning_rate": 0.0001, "loss": 7.9919, "loss/crossentropy": 2.3534783124923706, "loss/hidden": 3.0703125, "loss/jsd": 0.0, "loss/logits": 0.2405027374625206, "step": 3828 }, { "epoch": 0.239375, "grad_norm": 2.875, "grad_norm_var": 0.020406087239583332, "learning_rate": 0.0001, "loss": 8.1266, "loss/crossentropy": 2.3716864585876465, "loss/hidden": 3.15625, "loss/jsd": 0.0, "loss/logits": 0.2505777180194855, "step": 3830 }, { "epoch": 0.2395, "grad_norm": 3.0, "grad_norm_var": 0.015751139322916666, "learning_rate": 0.0001, "loss": 7.947, "loss/crossentropy": 2.482056736946106, "loss/hidden": 3.2109375, "loss/jsd": 0.0, "loss/logits": 0.2586553245782852, "step": 3832 }, { "epoch": 0.239625, "grad_norm": 2.953125, "grad_norm_var": 0.016730753580729167, "learning_rate": 0.0001, "loss": 8.0148, "loss/crossentropy": 2.323951005935669, "loss/hidden": 3.0546875, "loss/jsd": 0.0, "loss/logits": 0.24310573935508728, "step": 3834 }, { "epoch": 0.23975, "grad_norm": 3.3125, "grad_norm_var": 0.027425130208333332, "learning_rate": 0.0001, "loss": 8.0583, "loss/crossentropy": 2.2353726625442505, "loss/hidden": 3.1953125, "loss/jsd": 0.0, "loss/logits": 0.2633824050426483, "step": 3836 }, { "epoch": 0.239875, "grad_norm": 2.828125, "grad_norm_var": 0.02568359375, "learning_rate": 0.0001, "loss": 7.9074, "loss/crossentropy": 2.1682406663894653, "loss/hidden": 3.09375, "loss/jsd": 0.0, "loss/logits": 0.25316306948661804, "step": 3838 }, { "epoch": 0.24, "grad_norm": 2.671875, "grad_norm_var": 0.0255767822265625, "learning_rate": 0.0001, "loss": 7.76, "loss/crossentropy": 2.3561817407608032, "loss/hidden": 3.0390625, "loss/jsd": 0.0, "loss/logits": 0.24703025817871094, "step": 3840 }, { "epoch": 0.240125, "grad_norm": 2.84375, "grad_norm_var": 0.023346964518229166, "learning_rate": 0.0001, "loss": 7.8848, "loss/crossentropy": 2.1207789182662964, "loss/hidden": 3.0625, "loss/jsd": 0.0, "loss/logits": 0.24431215226650238, "step": 3842 }, { "epoch": 0.24025, "grad_norm": 2.796875, "grad_norm_var": 0.0224761962890625, "learning_rate": 0.0001, "loss": 7.8809, "loss/crossentropy": 2.177114486694336, "loss/hidden": 3.09375, "loss/jsd": 0.0, "loss/logits": 0.24755840003490448, "step": 3844 }, { "epoch": 0.240375, "grad_norm": 2.734375, "grad_norm_var": 0.02340087890625, "learning_rate": 0.0001, "loss": 7.8049, "loss/crossentropy": 2.241698145866394, "loss/hidden": 3.0703125, "loss/jsd": 0.0, "loss/logits": 0.23916028439998627, "step": 3846 }, { "epoch": 0.2405, "grad_norm": 2.9375, "grad_norm_var": 0.023209635416666666, "learning_rate": 0.0001, "loss": 8.1238, "loss/crossentropy": 2.3270636796951294, "loss/hidden": 3.1484375, "loss/jsd": 0.0, "loss/logits": 0.2830345705151558, "step": 3848 }, { "epoch": 0.240625, "grad_norm": 2.75, "grad_norm_var": 0.02125244140625, "learning_rate": 0.0001, "loss": 7.9985, "loss/crossentropy": 2.306910753250122, "loss/hidden": 3.078125, "loss/jsd": 0.0, "loss/logits": 0.24260255694389343, "step": 3850 }, { "epoch": 0.24075, "grad_norm": 2.65625, "grad_norm_var": 0.0091461181640625, "learning_rate": 0.0001, "loss": 7.898, "loss/crossentropy": 2.0596543550491333, "loss/hidden": 3.0859375, "loss/jsd": 0.0, "loss/logits": 0.23719244450330734, "step": 3852 }, { "epoch": 0.240875, "grad_norm": 2.78125, "grad_norm_var": 0.009300740559895833, "learning_rate": 0.0001, "loss": 7.8158, "loss/crossentropy": 2.3392993211746216, "loss/hidden": 3.1171875, "loss/jsd": 0.0, "loss/logits": 0.25461700558662415, "step": 3854 }, { "epoch": 0.241, "grad_norm": 3.015625, "grad_norm_var": 0.011649576822916667, "learning_rate": 0.0001, "loss": 7.9635, "loss/crossentropy": 2.7126669883728027, "loss/hidden": 3.15625, "loss/jsd": 0.0, "loss/logits": 0.2701471447944641, "step": 3856 }, { "epoch": 0.241125, "grad_norm": 2.5625, "grad_norm_var": 0.01881103515625, "learning_rate": 0.0001, "loss": 7.8456, "loss/crossentropy": 2.2381919622421265, "loss/hidden": 3.125, "loss/jsd": 0.0, "loss/logits": 0.2312241867184639, "step": 3858 }, { "epoch": 0.24125, "grad_norm": 2.734375, "grad_norm_var": 0.019188435872395833, "learning_rate": 0.0001, "loss": 7.8947, "loss/crossentropy": 2.3995821475982666, "loss/hidden": 3.0859375, "loss/jsd": 0.0, "loss/logits": 0.24393048137426376, "step": 3860 }, { "epoch": 0.241375, "grad_norm": 2.84375, "grad_norm_var": 0.0181060791015625, "learning_rate": 0.0001, "loss": 7.9092, "loss/crossentropy": 2.254945397377014, "loss/hidden": 3.125, "loss/jsd": 0.0, "loss/logits": 0.24428366869688034, "step": 3862 }, { "epoch": 0.2415, "grad_norm": 2.640625, "grad_norm_var": 0.0165924072265625, "learning_rate": 0.0001, "loss": 7.8811, "loss/crossentropy": 2.392805576324463, "loss/hidden": 3.0859375, "loss/jsd": 0.0, "loss/logits": 0.24890445917844772, "step": 3864 }, { "epoch": 0.241625, "grad_norm": 2.90625, "grad_norm_var": 0.016923014322916666, "learning_rate": 0.0001, "loss": 8.0593, "loss/crossentropy": 2.224582314491272, "loss/hidden": 3.1171875, "loss/jsd": 0.0, "loss/logits": 0.2585388273000717, "step": 3866 }, { "epoch": 0.24175, "grad_norm": 2.859375, "grad_norm_var": 0.015851847330729165, "learning_rate": 0.0001, "loss": 8.0158, "loss/crossentropy": 2.367936611175537, "loss/hidden": 3.125, "loss/jsd": 0.0, "loss/logits": 0.23479987680912018, "step": 3868 }, { "epoch": 0.241875, "grad_norm": 2.953125, "grad_norm_var": 0.0171295166015625, "learning_rate": 0.0001, "loss": 7.9695, "loss/crossentropy": 2.394118547439575, "loss/hidden": 3.09375, "loss/jsd": 0.0, "loss/logits": 0.2584435045719147, "step": 3870 }, { "epoch": 0.242, "grad_norm": 3.234375, "grad_norm_var": 0.0255859375, "learning_rate": 0.0001, "loss": 8.2134, "loss/crossentropy": 2.4198135137557983, "loss/hidden": 3.1171875, "loss/jsd": 0.0, "loss/logits": 0.2768760621547699, "step": 3872 }, { "epoch": 0.242125, "grad_norm": 2.84375, "grad_norm_var": 0.019270833333333334, "learning_rate": 0.0001, "loss": 7.9406, "loss/crossentropy": 2.373674988746643, "loss/hidden": 3.1171875, "loss/jsd": 0.0, "loss/logits": 0.24084321409463882, "step": 3874 }, { "epoch": 0.24225, "grad_norm": 2.859375, "grad_norm_var": 0.017186482747395832, "learning_rate": 0.0001, "loss": 8.0494, "loss/crossentropy": 2.1835745573043823, "loss/hidden": 3.140625, "loss/jsd": 0.0, "loss/logits": 0.23795650899410248, "step": 3876 }, { "epoch": 0.242375, "grad_norm": 3.15625, "grad_norm_var": 0.020699055989583333, "learning_rate": 0.0001, "loss": 8.1841, "loss/crossentropy": 2.590933918952942, "loss/hidden": 3.3203125, "loss/jsd": 0.0, "loss/logits": 0.2828524559736252, "step": 3878 }, { "epoch": 0.2425, "grad_norm": 2.6875, "grad_norm_var": 0.019820149739583334, "learning_rate": 0.0001, "loss": 7.9971, "loss/crossentropy": 2.4417784214019775, "loss/hidden": 3.0546875, "loss/jsd": 0.0, "loss/logits": 0.2358105331659317, "step": 3880 }, { "epoch": 0.242625, "grad_norm": 2.875, "grad_norm_var": 0.0228515625, "learning_rate": 0.0001, "loss": 7.927, "loss/crossentropy": 2.514025568962097, "loss/hidden": 3.1328125, "loss/jsd": 0.0, "loss/logits": 0.25648288428783417, "step": 3882 }, { "epoch": 0.24275, "grad_norm": 2.90625, "grad_norm_var": 0.022574869791666667, "learning_rate": 0.0001, "loss": 8.0021, "loss/crossentropy": 2.2680565118789673, "loss/hidden": 3.1328125, "loss/jsd": 0.0, "loss/logits": 0.24664485454559326, "step": 3884 }, { "epoch": 0.242875, "grad_norm": 2.828125, "grad_norm_var": 0.0239898681640625, "learning_rate": 0.0001, "loss": 7.648, "loss/crossentropy": 2.0859988927841187, "loss/hidden": 3.0703125, "loss/jsd": 0.0, "loss/logits": 0.2396468073129654, "step": 3886 }, { "epoch": 0.243, "grad_norm": 2.921875, "grad_norm_var": 0.016022745768229166, "learning_rate": 0.0001, "loss": 7.9453, "loss/crossentropy": 2.346192240715027, "loss/hidden": 3.1015625, "loss/jsd": 0.0, "loss/logits": 0.2454293891787529, "step": 3888 }, { "epoch": 0.243125, "grad_norm": 2.734375, "grad_norm_var": 0.016630045572916665, "learning_rate": 0.0001, "loss": 7.8395, "loss/crossentropy": 2.2563360929489136, "loss/hidden": 3.1328125, "loss/jsd": 0.0, "loss/logits": 0.25612016022205353, "step": 3890 }, { "epoch": 0.24325, "grad_norm": 3.0, "grad_norm_var": 0.017838541666666666, "learning_rate": 0.0001, "loss": 7.9441, "loss/crossentropy": 2.5668708086013794, "loss/hidden": 3.1953125, "loss/jsd": 0.0, "loss/logits": 0.27752047777175903, "step": 3892 }, { "epoch": 0.243375, "grad_norm": 3.015625, "grad_norm_var": 0.013785807291666667, "learning_rate": 0.0001, "loss": 7.7886, "loss/crossentropy": 2.1933363676071167, "loss/hidden": 3.046875, "loss/jsd": 0.0, "loss/logits": 0.23820041120052338, "step": 3894 }, { "epoch": 0.2435, "grad_norm": 2.828125, "grad_norm_var": 0.010480753580729167, "learning_rate": 0.0001, "loss": 7.9344, "loss/crossentropy": 2.3005311489105225, "loss/hidden": 3.0703125, "loss/jsd": 0.0, "loss/logits": 0.24230662733316422, "step": 3896 }, { "epoch": 0.243625, "grad_norm": 2.765625, "grad_norm_var": 0.009012858072916666, "learning_rate": 0.0001, "loss": 7.9342, "loss/crossentropy": 2.390279769897461, "loss/hidden": 3.125, "loss/jsd": 0.0, "loss/logits": 0.24232257902622223, "step": 3898 }, { "epoch": 0.24375, "grad_norm": 2.96875, "grad_norm_var": 0.007624308268229167, "learning_rate": 0.0001, "loss": 7.9865, "loss/crossentropy": 2.3709793090820312, "loss/hidden": 3.125, "loss/jsd": 0.0, "loss/logits": 0.27256976068019867, "step": 3900 }, { "epoch": 0.243875, "grad_norm": 2.6875, "grad_norm_var": 0.008885701497395834, "learning_rate": 0.0001, "loss": 7.7007, "loss/crossentropy": 2.0079593658447266, "loss/hidden": 3.0234375, "loss/jsd": 0.0, "loss/logits": 0.223800927400589, "step": 3902 }, { "epoch": 0.244, "grad_norm": 3.046875, "grad_norm_var": 0.010774739583333333, "learning_rate": 0.0001, "loss": 8.0776, "loss/crossentropy": 2.421340227127075, "loss/hidden": 3.09375, "loss/jsd": 0.0, "loss/logits": 0.25970883667469025, "step": 3904 }, { "epoch": 0.244125, "grad_norm": 3.09375, "grad_norm_var": 0.011644490559895833, "learning_rate": 0.0001, "loss": 7.944, "loss/crossentropy": 2.234217405319214, "loss/hidden": 3.1171875, "loss/jsd": 0.0, "loss/logits": 0.24383071064949036, "step": 3906 }, { "epoch": 0.24425, "grad_norm": 2.921875, "grad_norm_var": 0.010749308268229167, "learning_rate": 0.0001, "loss": 8.063, "loss/crossentropy": 2.5897743701934814, "loss/hidden": 3.078125, "loss/jsd": 0.0, "loss/logits": 0.25560788810253143, "step": 3908 }, { "epoch": 0.244375, "grad_norm": 3.25, "grad_norm_var": 0.09846598307291667, "learning_rate": 0.0001, "loss": 7.9418, "loss/crossentropy": 2.1307941675186157, "loss/hidden": 3.140625, "loss/jsd": 0.0, "loss/logits": 0.26282523572444916, "step": 3910 }, { "epoch": 0.2445, "grad_norm": 3.03125, "grad_norm_var": 0.0960357666015625, "learning_rate": 0.0001, "loss": 8.0311, "loss/crossentropy": 2.1970854997634888, "loss/hidden": 3.0625, "loss/jsd": 0.0, "loss/logits": 0.24500936269760132, "step": 3912 }, { "epoch": 0.244625, "grad_norm": 2.84375, "grad_norm_var": 0.09399312337239583, "learning_rate": 0.0001, "loss": 8.0802, "loss/crossentropy": 2.384564757347107, "loss/hidden": 3.1328125, "loss/jsd": 0.0, "loss/logits": 0.26147788763046265, "step": 3914 }, { "epoch": 0.24475, "grad_norm": 2.765625, "grad_norm_var": 0.09719645182291667, "learning_rate": 0.0001, "loss": 7.9915, "loss/crossentropy": 2.3258496522903442, "loss/hidden": 3.140625, "loss/jsd": 0.0, "loss/logits": 0.24284511804580688, "step": 3916 }, { "epoch": 0.244875, "grad_norm": 2.671875, "grad_norm_var": 0.09752197265625, "learning_rate": 0.0001, "loss": 7.7315, "loss/crossentropy": 2.225629687309265, "loss/hidden": 3.140625, "loss/jsd": 0.0, "loss/logits": 0.25035693496465683, "step": 3918 }, { "epoch": 0.245, "grad_norm": 2.84375, "grad_norm_var": 0.09841206868489584, "learning_rate": 0.0001, "loss": 7.9674, "loss/crossentropy": 2.0987013578414917, "loss/hidden": 3.0546875, "loss/jsd": 0.0, "loss/logits": 0.23937055468559265, "step": 3920 }, { "epoch": 0.245125, "grad_norm": 2.875, "grad_norm_var": 0.09804280598958333, "learning_rate": 0.0001, "loss": 8.0213, "loss/crossentropy": 2.294238328933716, "loss/hidden": 3.1484375, "loss/jsd": 0.0, "loss/logits": 0.25349240005016327, "step": 3922 }, { "epoch": 0.24525, "grad_norm": 2.8125, "grad_norm_var": 0.10007222493489583, "learning_rate": 0.0001, "loss": 8.0705, "loss/crossentropy": 2.1828103065490723, "loss/hidden": 3.140625, "loss/jsd": 0.0, "loss/logits": 0.25784528255462646, "step": 3924 }, { "epoch": 0.245375, "grad_norm": 3.109375, "grad_norm_var": 0.0164703369140625, "learning_rate": 0.0001, "loss": 7.986, "loss/crossentropy": 2.023577570915222, "loss/hidden": 3.1171875, "loss/jsd": 0.0, "loss/logits": 0.24002478271722794, "step": 3926 }, { "epoch": 0.2455, "grad_norm": 2.9375, "grad_norm_var": 0.01533203125, "learning_rate": 0.0001, "loss": 7.9938, "loss/crossentropy": 2.319468140602112, "loss/hidden": 3.140625, "loss/jsd": 0.0, "loss/logits": 0.2421136051416397, "step": 3928 }, { "epoch": 0.245625, "grad_norm": 2.734375, "grad_norm_var": 0.020406087239583332, "learning_rate": 0.0001, "loss": 7.8347, "loss/crossentropy": 2.2601557970046997, "loss/hidden": 3.1875, "loss/jsd": 0.0, "loss/logits": 0.2663475573062897, "step": 3930 }, { "epoch": 0.24575, "grad_norm": 3.265625, "grad_norm_var": 0.025178019205729166, "learning_rate": 0.0001, "loss": 8.3085, "loss/crossentropy": 2.735992908477783, "loss/hidden": 3.125, "loss/jsd": 0.0, "loss/logits": 0.2462017834186554, "step": 3932 }, { "epoch": 0.245875, "grad_norm": 2.90625, "grad_norm_var": 0.019774373372395834, "learning_rate": 0.0001, "loss": 7.9586, "loss/crossentropy": 2.361487627029419, "loss/hidden": 3.1484375, "loss/jsd": 0.0, "loss/logits": 0.24462847411632538, "step": 3934 }, { "epoch": 0.246, "grad_norm": 2.796875, "grad_norm_var": 0.031050618489583334, "learning_rate": 0.0001, "loss": 7.8382, "loss/crossentropy": 2.1037662029266357, "loss/hidden": 3.125, "loss/jsd": 0.0, "loss/logits": 0.24985255300998688, "step": 3936 }, { "epoch": 0.246125, "grad_norm": 2.828125, "grad_norm_var": 0.03432515462239583, "learning_rate": 0.0001, "loss": 7.9081, "loss/crossentropy": 2.383851647377014, "loss/hidden": 3.1328125, "loss/jsd": 0.0, "loss/logits": 0.23125454783439636, "step": 3938 }, { "epoch": 0.24625, "grad_norm": 3.0, "grad_norm_var": 0.03535054524739583, "learning_rate": 0.0001, "loss": 8.0179, "loss/crossentropy": 2.383346676826477, "loss/hidden": 3.15625, "loss/jsd": 0.0, "loss/logits": 0.2499464601278305, "step": 3940 }, { "epoch": 0.246375, "grad_norm": 2.796875, "grad_norm_var": 0.0344146728515625, "learning_rate": 0.0001, "loss": 7.9388, "loss/crossentropy": 2.221991539001465, "loss/hidden": 3.0390625, "loss/jsd": 0.0, "loss/logits": 0.25429509580135345, "step": 3942 }, { "epoch": 0.2465, "grad_norm": 2.8125, "grad_norm_var": 0.035074869791666664, "learning_rate": 0.0001, "loss": 7.7599, "loss/crossentropy": 2.186113476753235, "loss/hidden": 3.1328125, "loss/jsd": 0.0, "loss/logits": 0.22729599475860596, "step": 3944 }, { "epoch": 0.246625, "grad_norm": 2.890625, "grad_norm_var": 0.030223592122395834, "learning_rate": 0.0001, "loss": 7.8738, "loss/crossentropy": 2.5532344579696655, "loss/hidden": 3.15625, "loss/jsd": 0.0, "loss/logits": 0.2494053915143013, "step": 3946 }, { "epoch": 0.24675, "grad_norm": 2.953125, "grad_norm_var": 0.02056884765625, "learning_rate": 0.0001, "loss": 7.9992, "loss/crossentropy": 2.3294448852539062, "loss/hidden": 3.0703125, "loss/jsd": 0.0, "loss/logits": 0.24838833510875702, "step": 3948 }, { "epoch": 0.246875, "grad_norm": 3.296875, "grad_norm_var": 0.02877197265625, "learning_rate": 0.0001, "loss": 7.8963, "loss/crossentropy": 2.1606216430664062, "loss/hidden": 3.140625, "loss/jsd": 0.0, "loss/logits": 0.25755712389945984, "step": 3950 }, { "epoch": 0.247, "grad_norm": 2.671875, "grad_norm_var": 0.0237457275390625, "learning_rate": 0.0001, "loss": 7.9671, "loss/crossentropy": 2.2953662872314453, "loss/hidden": 3.125, "loss/jsd": 0.0, "loss/logits": 0.2478877156972885, "step": 3952 }, { "epoch": 0.247125, "grad_norm": 2.953125, "grad_norm_var": 0.022272745768229168, "learning_rate": 0.0001, "loss": 8.0112, "loss/crossentropy": 2.187902808189392, "loss/hidden": 3.1171875, "loss/jsd": 0.0, "loss/logits": 0.2752064913511276, "step": 3954 }, { "epoch": 0.24725, "grad_norm": 3.015625, "grad_norm_var": 0.021805826822916666, "learning_rate": 0.0001, "loss": 7.9066, "loss/crossentropy": 2.289653182029724, "loss/hidden": 3.1640625, "loss/jsd": 0.0, "loss/logits": 0.24460161477327347, "step": 3956 }, { "epoch": 0.247375, "grad_norm": 2.828125, "grad_norm_var": 0.021711222330729165, "learning_rate": 0.0001, "loss": 7.7137, "loss/crossentropy": 1.9959831833839417, "loss/hidden": 3.078125, "loss/jsd": 0.0, "loss/logits": 0.22675126791000366, "step": 3958 }, { "epoch": 0.2475, "grad_norm": 3.0625, "grad_norm_var": 0.023502604166666666, "learning_rate": 0.0001, "loss": 8.0294, "loss/crossentropy": 2.286497712135315, "loss/hidden": 3.1484375, "loss/jsd": 0.0, "loss/logits": 0.2442842796444893, "step": 3960 }, { "epoch": 0.247625, "grad_norm": 2.75, "grad_norm_var": 0.024137369791666665, "learning_rate": 0.0001, "loss": 7.9915, "loss/crossentropy": 2.2502633333206177, "loss/hidden": 3.1328125, "loss/jsd": 0.0, "loss/logits": 0.2236596718430519, "step": 3962 }, { "epoch": 0.24775, "grad_norm": 2.75, "grad_norm_var": 0.025169881184895833, "learning_rate": 0.0001, "loss": 7.9153, "loss/crossentropy": 2.2781342267990112, "loss/hidden": 3.1484375, "loss/jsd": 0.0, "loss/logits": 0.24574154615402222, "step": 3964 }, { "epoch": 0.247875, "grad_norm": 2.90625, "grad_norm_var": 0.013765462239583333, "learning_rate": 0.0001, "loss": 7.8426, "loss/crossentropy": 2.119445323944092, "loss/hidden": 3.0859375, "loss/jsd": 0.0, "loss/logits": 0.23813114315271378, "step": 3966 }, { "epoch": 0.248, "grad_norm": 3.03125, "grad_norm_var": 0.013841756184895833, "learning_rate": 0.0001, "loss": 8.0836, "loss/crossentropy": 2.188897430896759, "loss/hidden": 3.0859375, "loss/jsd": 0.0, "loss/logits": 0.2473863735795021, "step": 3968 }, { "epoch": 0.248125, "grad_norm": 2.828125, "grad_norm_var": 0.0142486572265625, "learning_rate": 0.0001, "loss": 7.9057, "loss/crossentropy": 2.279172897338867, "loss/hidden": 3.140625, "loss/jsd": 0.0, "loss/logits": 0.26379649341106415, "step": 3970 }, { "epoch": 0.24825, "grad_norm": 2.875, "grad_norm_var": 0.012116495768229167, "learning_rate": 0.0001, "loss": 7.7416, "loss/crossentropy": 2.3670825958251953, "loss/hidden": 3.0078125, "loss/jsd": 0.0, "loss/logits": 0.22166435420513153, "step": 3972 }, { "epoch": 0.248375, "grad_norm": 2.984375, "grad_norm_var": 0.014484659830729166, "learning_rate": 0.0001, "loss": 7.9191, "loss/crossentropy": 2.4747731685638428, "loss/hidden": 3.15625, "loss/jsd": 0.0, "loss/logits": 0.24381603300571442, "step": 3974 }, { "epoch": 0.2485, "grad_norm": 2.90625, "grad_norm_var": 0.0111968994140625, "learning_rate": 0.0001, "loss": 7.9802, "loss/crossentropy": 2.2822595834732056, "loss/hidden": 3.078125, "loss/jsd": 0.0, "loss/logits": 0.24216417968273163, "step": 3976 }, { "epoch": 0.248625, "grad_norm": 2.671875, "grad_norm_var": 0.013016764322916667, "learning_rate": 0.0001, "loss": 7.855, "loss/crossentropy": 2.4248218536376953, "loss/hidden": 3.1015625, "loss/jsd": 0.0, "loss/logits": 0.2518390789628029, "step": 3978 }, { "epoch": 0.24875, "grad_norm": 2.9375, "grad_norm_var": 0.012300618489583333, "learning_rate": 0.0001, "loss": 7.9805, "loss/crossentropy": 2.379852533340454, "loss/hidden": 3.15625, "loss/jsd": 0.0, "loss/logits": 0.2360517606139183, "step": 3980 }, { "epoch": 0.248875, "grad_norm": 2.90625, "grad_norm_var": 0.010431925455729166, "learning_rate": 0.0001, "loss": 7.9083, "loss/crossentropy": 2.3196099996566772, "loss/hidden": 3.0546875, "loss/jsd": 0.0, "loss/logits": 0.23856279253959656, "step": 3982 }, { "epoch": 0.249, "grad_norm": 2.9375, "grad_norm_var": 0.011432902018229166, "learning_rate": 0.0001, "loss": 7.898, "loss/crossentropy": 2.286087989807129, "loss/hidden": 3.09375, "loss/jsd": 0.0, "loss/logits": 0.24119911342859268, "step": 3984 }, { "epoch": 0.249125, "grad_norm": 2.953125, "grad_norm_var": 0.010904947916666666, "learning_rate": 0.0001, "loss": 7.9097, "loss/crossentropy": 2.2220507860183716, "loss/hidden": 3.078125, "loss/jsd": 0.0, "loss/logits": 0.2348850816488266, "step": 3986 }, { "epoch": 0.24925, "grad_norm": 2.875, "grad_norm_var": 0.010933430989583333, "learning_rate": 0.0001, "loss": 8.1498, "loss/crossentropy": 2.376457691192627, "loss/hidden": 3.0546875, "loss/jsd": 0.0, "loss/logits": 0.2478000372648239, "step": 3988 }, { "epoch": 0.249375, "grad_norm": 2.78125, "grad_norm_var": 0.008552042643229167, "learning_rate": 0.0001, "loss": 7.9849, "loss/crossentropy": 2.367288827896118, "loss/hidden": 3.140625, "loss/jsd": 0.0, "loss/logits": 0.25519511103630066, "step": 3990 }, { "epoch": 0.2495, "grad_norm": 3.09375, "grad_norm_var": 0.012495930989583333, "learning_rate": 0.0001, "loss": 8.0479, "loss/crossentropy": 2.317150592803955, "loss/hidden": 3.1328125, "loss/jsd": 0.0, "loss/logits": 0.25810791552066803, "step": 3992 }, { "epoch": 0.249625, "grad_norm": 2.78125, "grad_norm_var": 0.01011962890625, "learning_rate": 0.0001, "loss": 7.8396, "loss/crossentropy": 2.1635915637016296, "loss/hidden": 3.140625, "loss/jsd": 0.0, "loss/logits": 0.2583113983273506, "step": 3994 }, { "epoch": 0.24975, "grad_norm": 2.640625, "grad_norm_var": 0.013451131184895833, "learning_rate": 0.0001, "loss": 7.8905, "loss/crossentropy": 2.036882519721985, "loss/hidden": 3.0546875, "loss/jsd": 0.0, "loss/logits": 0.23229141533374786, "step": 3996 }, { "epoch": 0.249875, "grad_norm": 2.78125, "grad_norm_var": 0.01529541015625, "learning_rate": 0.0001, "loss": 7.9271, "loss/crossentropy": 2.4220268726348877, "loss/hidden": 3.046875, "loss/jsd": 0.0, "loss/logits": 0.2348419427871704, "step": 3998 }, { "epoch": 0.25, "grad_norm": 2.75, "grad_norm_var": 0.013655598958333333, "learning_rate": 0.0001, "loss": 7.6323, "loss/crossentropy": 2.0106801986694336, "loss/hidden": 3.0390625, "loss/jsd": 0.0, "loss/logits": 0.2393062487244606, "step": 4000 } ], "logging_steps": 2, "max_steps": 16000, "num_input_tokens_seen": 0, "num_train_epochs": 9223372036854775807, "save_steps": 4000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 4.16590621310976e+18, "train_batch_size": 8, "trial_name": null, "trial_params": null }