diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,24027 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.25, + "eval_steps": 2000, + "global_step": 4000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.000125, + "grad_norm": 428.0, + "learning_rate": 1.18e-05, + "loss": 99.3619, + "loss/crossentropy": 9.37970495223999, + "loss/hidden": 16.625, + "loss/jsd": 0.0, + "loss/logits": 7.257124900817871, + "step": 2 + }, + { + "epoch": 0.00025, + "grad_norm": 356.0, + "learning_rate": 1.3600000000000002e-05, + "loss": 97.1216, + "loss/crossentropy": 9.159881591796875, + "loss/hidden": 16.625, + "loss/jsd": 0.0, + "loss/logits": 7.145160675048828, + "step": 4 + }, + { + "epoch": 0.000375, + "grad_norm": 380.0, + "learning_rate": 1.54e-05, + "loss": 100.1942, + "loss/crossentropy": 9.186327457427979, + "loss/hidden": 16.625, + "loss/jsd": 0.0, + "loss/logits": 7.2050676345825195, + "step": 6 + }, + { + "epoch": 0.0005, + "grad_norm": 185.0, + "learning_rate": 1.72e-05, + "loss": 95.7478, + "loss/crossentropy": 8.873358249664307, + "loss/hidden": 16.5625, + "loss/jsd": 0.0, + "loss/logits": 6.897953987121582, + "step": 8 + }, + { + "epoch": 0.000625, + "grad_norm": 163.0, + "learning_rate": 1.9e-05, + "loss": 92.634, + "loss/crossentropy": 8.72307538986206, + "loss/hidden": 16.375, + "loss/jsd": 0.0, + "loss/logits": 6.7046730518341064, + "step": 10 + }, + { + "epoch": 0.00075, + "grad_norm": 153.0, + "learning_rate": 2.0800000000000004e-05, + "loss": 88.4929, + "loss/crossentropy": 8.53145456314087, + "loss/hidden": 16.3125, + "loss/jsd": 0.0, + "loss/logits": 6.585271596908569, + "step": 12 + }, + { + "epoch": 0.000875, + "grad_norm": 153.0, + "learning_rate": 2.2600000000000004e-05, + "loss": 86.7945, + "loss/crossentropy": 8.174943923950195, + "loss/hidden": 16.25, + "loss/jsd": 0.0, + "loss/logits": 6.037625789642334, + "step": 14 + }, + { + "epoch": 0.001, + "grad_norm": 121.0, + "grad_norm_var": 14208.2, + "learning_rate": 2.4400000000000004e-05, + "loss": 81.4154, + "loss/crossentropy": 7.956912994384766, + "loss/hidden": 15.8125, + "loss/jsd": 0.0, + "loss/logits": 5.961349010467529, + "step": 16 + }, + { + "epoch": 0.001125, + "grad_norm": 136.0, + "grad_norm_var": 9968.116666666667, + "learning_rate": 2.6200000000000003e-05, + "loss": 80.922, + "loss/crossentropy": 7.872050046920776, + "loss/hidden": 15.34375, + "loss/jsd": 0.0, + "loss/logits": 5.787276268005371, + "step": 18 + }, + { + "epoch": 0.00125, + "grad_norm": 64.5, + "grad_norm_var": 8084.873958333334, + "learning_rate": 2.8000000000000003e-05, + "loss": 76.7299, + "loss/crossentropy": 7.452724456787109, + "loss/hidden": 15.21875, + "loss/jsd": 0.0, + "loss/logits": 5.1816017627716064, + "step": 20 + }, + { + "epoch": 0.001375, + "grad_norm": 43.0, + "grad_norm_var": 4135.966666666666, + "learning_rate": 2.9800000000000006e-05, + "loss": 74.1086, + "loss/crossentropy": 7.244980812072754, + "loss/hidden": 15.0625, + "loss/jsd": 0.0, + "loss/logits": 5.154205322265625, + "step": 22 + }, + { + "epoch": 0.0015, + "grad_norm": 49.0, + "grad_norm_var": 3930.31640625, + "learning_rate": 3.16e-05, + "loss": 71.8732, + "loss/crossentropy": 7.059436321258545, + "loss/hidden": 15.0, + "loss/jsd": 0.0, + "loss/logits": 4.9876039028167725, + "step": 24 + }, + { + "epoch": 0.001625, + "grad_norm": 77.5, + "grad_norm_var": 3864.5622395833334, + "learning_rate": 3.3400000000000005e-05, + "loss": 66.5866, + "loss/crossentropy": 6.667392730712891, + "loss/hidden": 14.90625, + "loss/jsd": 0.0, + "loss/logits": 4.435611724853516, + "step": 26 + }, + { + "epoch": 0.00175, + "grad_norm": 72.0, + "grad_norm_var": 3618.5559895833335, + "learning_rate": 3.520000000000001e-05, + "loss": 62.1951, + "loss/crossentropy": 6.15596079826355, + "loss/hidden": 14.15625, + "loss/jsd": 0.0, + "loss/logits": 4.280491590499878, + "step": 28 + }, + { + "epoch": 0.001875, + "grad_norm": 66.5, + "grad_norm_var": 3451.262239583333, + "learning_rate": 3.7e-05, + "loss": 59.1185, + "loss/crossentropy": 5.833110809326172, + "loss/hidden": 13.6875, + "loss/jsd": 0.0, + "loss/logits": 3.8512399196624756, + "step": 30 + }, + { + "epoch": 0.002, + "grad_norm": 63.0, + "grad_norm_var": 3467.72890625, + "learning_rate": 3.88e-05, + "loss": 54.6163, + "loss/crossentropy": 5.594937324523926, + "loss/hidden": 13.40625, + "loss/jsd": 0.0, + "loss/logits": 3.619442343711853, + "step": 32 + }, + { + "epoch": 0.002125, + "grad_norm": 70.0, + "grad_norm_var": 585.5997395833333, + "learning_rate": 4.0600000000000004e-05, + "loss": 49.8176, + "loss/crossentropy": 5.316510438919067, + "loss/hidden": 12.96875, + "loss/jsd": 0.0, + "loss/logits": 3.1636284589767456, + "step": 34 + }, + { + "epoch": 0.00225, + "grad_norm": 54.25, + "grad_norm_var": 208.6875, + "learning_rate": 4.240000000000001e-05, + "loss": 44.5006, + "loss/crossentropy": 4.8605875968933105, + "loss/hidden": 12.03125, + "loss/jsd": 0.0, + "loss/logits": 2.5045835971832275, + "step": 36 + }, + { + "epoch": 0.002375, + "grad_norm": 43.0, + "grad_norm_var": 205.99140625, + "learning_rate": 4.420000000000001e-05, + "loss": 41.8596, + "loss/crossentropy": 4.582718849182129, + "loss/hidden": 12.0, + "loss/jsd": 0.0, + "loss/logits": 2.616296648979187, + "step": 38 + }, + { + "epoch": 0.0025, + "grad_norm": 39.75, + "grad_norm_var": 217.99140625, + "learning_rate": 4.600000000000001e-05, + "loss": 38.6887, + "loss/crossentropy": 4.197612762451172, + "loss/hidden": 11.21875, + "loss/jsd": 0.0, + "loss/logits": 2.2158924341201782, + "step": 40 + }, + { + "epoch": 0.002625, + "grad_norm": 57.75, + "grad_norm_var": 215.47395833333334, + "learning_rate": 4.78e-05, + "loss": 35.5871, + "loss/crossentropy": 4.077736258506775, + "loss/hidden": 10.59375, + "loss/jsd": 0.0, + "loss/logits": 1.8813174366950989, + "step": 42 + }, + { + "epoch": 0.00275, + "grad_norm": 33.75, + "grad_norm_var": 158.04140625, + "learning_rate": 4.96e-05, + "loss": 33.6872, + "loss/crossentropy": 4.088571310043335, + "loss/hidden": 10.46875, + "loss/jsd": 0.0, + "loss/logits": 1.9159515500068665, + "step": 44 + }, + { + "epoch": 0.002875, + "grad_norm": 27.25, + "grad_norm_var": 173.02057291666668, + "learning_rate": 5.14e-05, + "loss": 31.5202, + "loss/crossentropy": 3.7112059593200684, + "loss/hidden": 10.09375, + "loss/jsd": 0.0, + "loss/logits": 1.8669533133506775, + "step": 46 + }, + { + "epoch": 0.003, + "grad_norm": 30.625, + "grad_norm_var": 153.88170572916667, + "learning_rate": 5.3200000000000006e-05, + "loss": 29.7576, + "loss/crossentropy": 3.6459821462631226, + "loss/hidden": 9.875, + "loss/jsd": 0.0, + "loss/logits": 1.7125096917152405, + "step": 48 + }, + { + "epoch": 0.003125, + "grad_norm": 24.625, + "grad_norm_var": 156.196875, + "learning_rate": 5.500000000000001e-05, + "loss": 28.6917, + "loss/crossentropy": 3.534511685371399, + "loss/hidden": 9.59375, + "loss/jsd": 0.0, + "loss/logits": 1.5916491150856018, + "step": 50 + }, + { + "epoch": 0.00325, + "grad_norm": 26.0, + "grad_norm_var": 119.83958333333334, + "learning_rate": 5.680000000000001e-05, + "loss": 27.6631, + "loss/crossentropy": 3.3382843732833862, + "loss/hidden": 9.25, + "loss/jsd": 0.0, + "loss/logits": 1.4924674034118652, + "step": 52 + }, + { + "epoch": 0.003375, + "grad_norm": 21.25, + "grad_norm_var": 231.38541666666666, + "learning_rate": 5.860000000000001e-05, + "loss": 26.1809, + "loss/crossentropy": 3.426845669746399, + "loss/hidden": 9.125, + "loss/jsd": 0.0, + "loss/logits": 1.3821245431900024, + "step": 54 + }, + { + "epoch": 0.0035, + "grad_norm": 21.25, + "grad_norm_var": 249.90358072916666, + "learning_rate": 6.040000000000001e-05, + "loss": 25.1479, + "loss/crossentropy": 3.4015276432037354, + "loss/hidden": 8.65625, + "loss/jsd": 0.0, + "loss/logits": 1.259027361869812, + "step": 56 + }, + { + "epoch": 0.003625, + "grad_norm": 59.75, + "grad_norm_var": 251.05416666666667, + "learning_rate": 6.220000000000001e-05, + "loss": 24.6783, + "loss/crossentropy": 3.363521456718445, + "loss/hidden": 8.5625, + "loss/jsd": 0.0, + "loss/logits": 1.218069314956665, + "step": 58 + }, + { + "epoch": 0.00375, + "grad_norm": 14.9375, + "grad_norm_var": 283.126806640625, + "learning_rate": 6.400000000000001e-05, + "loss": 23.6541, + "loss/crossentropy": 3.4112290143966675, + "loss/hidden": 8.5, + "loss/jsd": 0.0, + "loss/logits": 1.222625195980072, + "step": 60 + }, + { + "epoch": 0.003875, + "grad_norm": 21.25, + "grad_norm_var": 301.22120768229166, + "learning_rate": 6.58e-05, + "loss": 22.9688, + "loss/crossentropy": 3.150188446044922, + "loss/hidden": 8.28125, + "loss/jsd": 0.0, + "loss/logits": 1.157865822315216, + "step": 62 + }, + { + "epoch": 0.004, + "grad_norm": 20.25, + "grad_norm_var": 336.315087890625, + "learning_rate": 6.76e-05, + "loss": 22.4484, + "loss/crossentropy": 3.2142295837402344, + "loss/hidden": 8.078125, + "loss/jsd": 0.0, + "loss/logits": 1.1245205402374268, + "step": 64 + }, + { + "epoch": 0.004125, + "grad_norm": 20.25, + "grad_norm_var": 350.4535807291667, + "learning_rate": 6.94e-05, + "loss": 21.3778, + "loss/crossentropy": 3.3086668252944946, + "loss/hidden": 7.78125, + "loss/jsd": 0.0, + "loss/logits": 1.0809211134910583, + "step": 66 + }, + { + "epoch": 0.00425, + "grad_norm": 17.75, + "grad_norm_var": 363.0113932291667, + "learning_rate": 7.120000000000001e-05, + "loss": 20.7697, + "loss/crossentropy": 3.0438809394836426, + "loss/hidden": 7.578125, + "loss/jsd": 0.0, + "loss/logits": 0.9832420945167542, + "step": 68 + }, + { + "epoch": 0.004375, + "grad_norm": 16.625, + "grad_norm_var": 176.550634765625, + "learning_rate": 7.3e-05, + "loss": 20.7695, + "loss/crossentropy": 3.005946159362793, + "loss/hidden": 7.609375, + "loss/jsd": 0.0, + "loss/logits": 0.979515790939331, + "step": 70 + }, + { + "epoch": 0.0045, + "grad_norm": 14.6875, + "grad_norm_var": 182.58118489583333, + "learning_rate": 7.48e-05, + "loss": 20.2026, + "loss/crossentropy": 3.1091376543045044, + "loss/hidden": 7.46875, + "loss/jsd": 0.0, + "loss/logits": 0.994384378194809, + "step": 72 + }, + { + "epoch": 0.004625, + "grad_norm": 19.75, + "grad_norm_var": 81.90494791666667, + "learning_rate": 7.66e-05, + "loss": 19.8715, + "loss/crossentropy": 3.172377586364746, + "loss/hidden": 7.359375, + "loss/jsd": 0.0, + "loss/logits": 1.0095622539520264, + "step": 74 + }, + { + "epoch": 0.00475, + "grad_norm": 17.0, + "grad_norm_var": 81.86875, + "learning_rate": 7.840000000000001e-05, + "loss": 19.2123, + "loss/crossentropy": 2.75563645362854, + "loss/hidden": 7.125, + "loss/jsd": 0.0, + "loss/logits": 0.8763986825942993, + "step": 76 + }, + { + "epoch": 0.004875, + "grad_norm": 15.9375, + "grad_norm_var": 84.11599934895834, + "learning_rate": 8.020000000000001e-05, + "loss": 19.1961, + "loss/crossentropy": 2.981382369995117, + "loss/hidden": 7.328125, + "loss/jsd": 0.0, + "loss/logits": 0.9651070237159729, + "step": 78 + }, + { + "epoch": 0.005, + "grad_norm": 15.1875, + "grad_norm_var": 2.9731770833333333, + "learning_rate": 8.200000000000001e-05, + "loss": 18.5459, + "loss/crossentropy": 3.1191996335983276, + "loss/hidden": 7.1875, + "loss/jsd": 0.0, + "loss/logits": 0.8418412506580353, + "step": 80 + }, + { + "epoch": 0.005125, + "grad_norm": 14.6875, + "grad_norm_var": 2.395166015625, + "learning_rate": 8.38e-05, + "loss": 18.149, + "loss/crossentropy": 2.564948797225952, + "loss/hidden": 6.9375, + "loss/jsd": 0.0, + "loss/logits": 0.8517245650291443, + "step": 82 + }, + { + "epoch": 0.00525, + "grad_norm": 16.0, + "grad_norm_var": 2.08125, + "learning_rate": 8.560000000000001e-05, + "loss": 18.4829, + "loss/crossentropy": 3.1382123231887817, + "loss/hidden": 7.0625, + "loss/jsd": 0.0, + "loss/logits": 0.9703748524188995, + "step": 84 + }, + { + "epoch": 0.005375, + "grad_norm": 15.9375, + "grad_norm_var": 2.0563639322916667, + "learning_rate": 8.740000000000001e-05, + "loss": 17.9624, + "loss/crossentropy": 3.091128706932068, + "loss/hidden": 6.75, + "loss/jsd": 0.0, + "loss/logits": 0.8826551735401154, + "step": 86 + }, + { + "epoch": 0.0055, + "grad_norm": 15.25, + "grad_norm_var": 1.9325358072916667, + "learning_rate": 8.92e-05, + "loss": 17.5696, + "loss/crossentropy": 2.9899988174438477, + "loss/hidden": 6.65625, + "loss/jsd": 0.0, + "loss/logits": 0.7654303312301636, + "step": 88 + }, + { + "epoch": 0.005625, + "grad_norm": 16.25, + "grad_norm_var": 0.789306640625, + "learning_rate": 9.1e-05, + "loss": 17.0042, + "loss/crossentropy": 2.8025410175323486, + "loss/hidden": 6.65625, + "loss/jsd": 0.0, + "loss/logits": 0.7915366590023041, + "step": 90 + }, + { + "epoch": 0.00575, + "grad_norm": 13.625, + "grad_norm_var": 2.162744140625, + "learning_rate": 9.28e-05, + "loss": 17.2984, + "loss/crossentropy": 3.013433814048767, + "loss/hidden": 6.5625, + "loss/jsd": 0.0, + "loss/logits": 0.8098262250423431, + "step": 92 + }, + { + "epoch": 0.005875, + "grad_norm": 16.5, + "grad_norm_var": 2.249739583333333, + "learning_rate": 9.46e-05, + "loss": 17.1342, + "loss/crossentropy": 2.912646174430847, + "loss/hidden": 6.453125, + "loss/jsd": 0.0, + "loss/logits": 0.8038456439971924, + "step": 94 + }, + { + "epoch": 0.006, + "grad_norm": 11.625, + "grad_norm_var": 2.714306640625, + "learning_rate": 9.64e-05, + "loss": 16.7369, + "loss/crossentropy": 2.9029338359832764, + "loss/hidden": 6.453125, + "loss/jsd": 0.0, + "loss/logits": 0.7524089217185974, + "step": 96 + }, + { + "epoch": 0.006125, + "grad_norm": 12.375, + "grad_norm_var": 4.069124348958334, + "learning_rate": 9.82e-05, + "loss": 16.4013, + "loss/crossentropy": 2.8420186042785645, + "loss/hidden": 6.390625, + "loss/jsd": 0.0, + "loss/logits": 0.7496578097343445, + "step": 98 + }, + { + "epoch": 0.00625, + "grad_norm": 12.5, + "grad_norm_var": 3.99140625, + "learning_rate": 0.0001, + "loss": 16.5129, + "loss/crossentropy": 2.7955269813537598, + "loss/hidden": 6.34375, + "loss/jsd": 0.0, + "loss/logits": 0.7174519896507263, + "step": 100 + }, + { + "epoch": 0.006375, + "grad_norm": 11.6875, + "grad_norm_var": 3.717431640625, + "learning_rate": 0.0001, + "loss": 16.1005, + "loss/crossentropy": 2.890980839729309, + "loss/hidden": 6.3125, + "loss/jsd": 0.0, + "loss/logits": 0.7031005620956421, + "step": 102 + }, + { + "epoch": 0.0065, + "grad_norm": 10.1875, + "grad_norm_var": 4.207796223958334, + "learning_rate": 0.0001, + "loss": 16.0555, + "loss/crossentropy": 2.8553627729415894, + "loss/hidden": 6.1875, + "loss/jsd": 0.0, + "loss/logits": 0.6933196187019348, + "step": 104 + }, + { + "epoch": 0.006625, + "grad_norm": 15.0625, + "grad_norm_var": 3.723372395833333, + "learning_rate": 0.0001, + "loss": 16.25, + "loss/crossentropy": 2.7866374254226685, + "loss/hidden": 6.140625, + "loss/jsd": 0.0, + "loss/logits": 0.6986292898654938, + "step": 106 + }, + { + "epoch": 0.00675, + "grad_norm": 12.25, + "grad_norm_var": 3.645556640625, + "learning_rate": 0.0001, + "loss": 16.4036, + "loss/crossentropy": 2.8795191049575806, + "loss/hidden": 6.203125, + "loss/jsd": 0.0, + "loss/logits": 0.6980823576450348, + "step": 108 + }, + { + "epoch": 0.006875, + "grad_norm": 11.3125, + "grad_norm_var": 2.959830729166667, + "learning_rate": 0.0001, + "loss": 15.5029, + "loss/crossentropy": 2.6974622011184692, + "loss/hidden": 6.265625, + "loss/jsd": 0.0, + "loss/logits": 0.7379841208457947, + "step": 110 + }, + { + "epoch": 0.007, + "grad_norm": 14.375, + "grad_norm_var": 2.8355305989583335, + "learning_rate": 0.0001, + "loss": 15.5842, + "loss/crossentropy": 2.5517276525497437, + "loss/hidden": 6.0, + "loss/jsd": 0.0, + "loss/logits": 0.6455385684967041, + "step": 112 + }, + { + "epoch": 0.007125, + "grad_norm": 12.6875, + "grad_norm_var": 2.512093098958333, + "learning_rate": 0.0001, + "loss": 15.6556, + "loss/crossentropy": 2.710301995277405, + "loss/hidden": 5.84375, + "loss/jsd": 0.0, + "loss/logits": 0.7062334418296814, + "step": 114 + }, + { + "epoch": 0.00725, + "grad_norm": 13.75, + "grad_norm_var": 2.6353515625, + "learning_rate": 0.0001, + "loss": 15.3746, + "loss/crossentropy": 2.9104617834091187, + "loss/hidden": 5.859375, + "loss/jsd": 0.0, + "loss/logits": 0.6806878745555878, + "step": 116 + }, + { + "epoch": 0.007375, + "grad_norm": 10.8125, + "grad_norm_var": 2.628059895833333, + "learning_rate": 0.0001, + "loss": 15.3926, + "loss/crossentropy": 2.7969307899475098, + "loss/hidden": 5.890625, + "loss/jsd": 0.0, + "loss/logits": 0.7158068418502808, + "step": 118 + }, + { + "epoch": 0.0075, + "grad_norm": 11.375, + "grad_norm_var": 2.486962890625, + "learning_rate": 0.0001, + "loss": 15.4671, + "loss/crossentropy": 2.758000612258911, + "loss/hidden": 6.125, + "loss/jsd": 0.0, + "loss/logits": 0.692883163690567, + "step": 120 + }, + { + "epoch": 0.007625, + "grad_norm": 11.9375, + "grad_norm_var": 2.400764973958333, + "learning_rate": 0.0001, + "loss": 15.5392, + "loss/crossentropy": 2.814534068107605, + "loss/hidden": 5.75, + "loss/jsd": 0.0, + "loss/logits": 0.669840395450592, + "step": 122 + }, + { + "epoch": 0.00775, + "grad_norm": 10.875, + "grad_norm_var": 1.5585774739583333, + "learning_rate": 0.0001, + "loss": 15.0821, + "loss/crossentropy": 2.4222623109817505, + "loss/hidden": 5.8125, + "loss/jsd": 0.0, + "loss/logits": 0.6296161711215973, + "step": 124 + }, + { + "epoch": 0.007875, + "grad_norm": 9.375, + "grad_norm_var": 2.016650390625, + "learning_rate": 0.0001, + "loss": 14.7169, + "loss/crossentropy": 2.8036348819732666, + "loss/hidden": 5.6875, + "loss/jsd": 0.0, + "loss/logits": 0.5952793657779694, + "step": 126 + }, + { + "epoch": 0.008, + "grad_norm": 19.5, + "grad_norm_var": 5.5265625, + "learning_rate": 0.0001, + "loss": 15.6228, + "loss/crossentropy": 2.871894598007202, + "loss/hidden": 5.9375, + "loss/jsd": 0.0, + "loss/logits": 0.8850542902946472, + "step": 128 + }, + { + "epoch": 0.008125, + "grad_norm": 13.875, + "grad_norm_var": 5.818489583333333, + "learning_rate": 0.0001, + "loss": 15.3871, + "loss/crossentropy": 2.7824504375457764, + "loss/hidden": 5.84375, + "loss/jsd": 0.0, + "loss/logits": 0.6649284958839417, + "step": 130 + }, + { + "epoch": 0.00825, + "grad_norm": 9.75, + "grad_norm_var": 6.017122395833334, + "learning_rate": 0.0001, + "loss": 14.802, + "loss/crossentropy": 2.7720154523849487, + "loss/hidden": 5.640625, + "loss/jsd": 0.0, + "loss/logits": 0.6440402269363403, + "step": 132 + }, + { + "epoch": 0.008375, + "grad_norm": 12.0625, + "grad_norm_var": 5.979947916666666, + "learning_rate": 0.0001, + "loss": 14.896, + "loss/crossentropy": 2.4699759483337402, + "loss/hidden": 5.65625, + "loss/jsd": 0.0, + "loss/logits": 0.631425142288208, + "step": 134 + }, + { + "epoch": 0.0085, + "grad_norm": 11.8125, + "grad_norm_var": 6.357747395833333, + "learning_rate": 0.0001, + "loss": 14.7694, + "loss/crossentropy": 2.9012371301651, + "loss/hidden": 5.796875, + "loss/jsd": 0.0, + "loss/logits": 0.709007978439331, + "step": 136 + }, + { + "epoch": 0.008625, + "grad_norm": 10.625, + "grad_norm_var": 6.092171223958333, + "learning_rate": 0.0001, + "loss": 14.6342, + "loss/crossentropy": 2.7152575254440308, + "loss/hidden": 5.703125, + "loss/jsd": 0.0, + "loss/logits": 0.6243754923343658, + "step": 138 + }, + { + "epoch": 0.00875, + "grad_norm": 8.6875, + "grad_norm_var": 6.715104166666666, + "learning_rate": 0.0001, + "loss": 14.6597, + "loss/crossentropy": 2.5907901525497437, + "loss/hidden": 5.578125, + "loss/jsd": 0.0, + "loss/logits": 0.6299647688865662, + "step": 140 + }, + { + "epoch": 0.008875, + "grad_norm": 8.1875, + "grad_norm_var": 7.1994140625, + "learning_rate": 0.0001, + "loss": 13.9556, + "loss/crossentropy": 2.699749708175659, + "loss/hidden": 5.515625, + "loss/jsd": 0.0, + "loss/logits": 0.5889811217784882, + "step": 142 + }, + { + "epoch": 0.009, + "grad_norm": 8.5, + "grad_norm_var": 3.0494791666666665, + "learning_rate": 0.0001, + "loss": 13.9315, + "loss/crossentropy": 2.455536365509033, + "loss/hidden": 5.515625, + "loss/jsd": 0.0, + "loss/logits": 0.5711115598678589, + "step": 144 + }, + { + "epoch": 0.009125, + "grad_norm": 11.6875, + "grad_norm_var": 1.7934895833333333, + "learning_rate": 0.0001, + "loss": 14.3392, + "loss/crossentropy": 2.5603734254837036, + "loss/hidden": 5.515625, + "loss/jsd": 0.0, + "loss/logits": 0.6282104849815369, + "step": 146 + }, + { + "epoch": 0.00925, + "grad_norm": 9.9375, + "grad_norm_var": 1.81640625, + "learning_rate": 0.0001, + "loss": 14.155, + "loss/crossentropy": 2.7742687463760376, + "loss/hidden": 5.546875, + "loss/jsd": 0.0, + "loss/logits": 0.5838975608348846, + "step": 148 + }, + { + "epoch": 0.009375, + "grad_norm": 7.28125, + "grad_norm_var": 1.9861287434895833, + "learning_rate": 0.0001, + "loss": 14.2792, + "loss/crossentropy": 2.50667142868042, + "loss/hidden": 5.515625, + "loss/jsd": 0.0, + "loss/logits": 0.547415554523468, + "step": 150 + }, + { + "epoch": 0.0095, + "grad_norm": 12.5, + "grad_norm_var": 2.164969889322917, + "learning_rate": 0.0001, + "loss": 14.4224, + "loss/crossentropy": 2.701486349105835, + "loss/hidden": 5.4375, + "loss/jsd": 0.0, + "loss/logits": 0.567545473575592, + "step": 152 + }, + { + "epoch": 0.009625, + "grad_norm": 10.0, + "grad_norm_var": 1.96607666015625, + "learning_rate": 0.0001, + "loss": 13.5836, + "loss/crossentropy": 2.611671805381775, + "loss/hidden": 5.375, + "loss/jsd": 0.0, + "loss/logits": 0.5661961734294891, + "step": 154 + }, + { + "epoch": 0.00975, + "grad_norm": 9.125, + "grad_norm_var": 1.7756795247395833, + "learning_rate": 0.0001, + "loss": 13.775, + "loss/crossentropy": 2.598029851913452, + "loss/hidden": 5.359375, + "loss/jsd": 0.0, + "loss/logits": 0.5618497729301453, + "step": 156 + }, + { + "epoch": 0.009875, + "grad_norm": 7.9375, + "grad_norm_var": 1.92681884765625, + "learning_rate": 0.0001, + "loss": 13.5367, + "loss/crossentropy": 2.6330727338790894, + "loss/hidden": 5.40625, + "loss/jsd": 0.0, + "loss/logits": 0.5757810473442078, + "step": 158 + }, + { + "epoch": 0.01, + "grad_norm": 9.375, + "grad_norm_var": 1.8233683268229166, + "learning_rate": 0.0001, + "loss": 13.7555, + "loss/crossentropy": 2.6642661094665527, + "loss/hidden": 5.4375, + "loss/jsd": 0.0, + "loss/logits": 0.5554981827735901, + "step": 160 + }, + { + "epoch": 0.010125, + "grad_norm": 7.53125, + "grad_norm_var": 1.7612630208333333, + "learning_rate": 0.0001, + "loss": 13.4486, + "loss/crossentropy": 2.670701742172241, + "loss/hidden": 5.359375, + "loss/jsd": 0.0, + "loss/logits": 0.5661377012729645, + "step": 162 + }, + { + "epoch": 0.01025, + "grad_norm": 10.25, + "grad_norm_var": 1.9761678059895833, + "learning_rate": 0.0001, + "loss": 13.2185, + "loss/crossentropy": 2.4556703567504883, + "loss/hidden": 5.390625, + "loss/jsd": 0.0, + "loss/logits": 0.5137010216712952, + "step": 164 + }, + { + "epoch": 0.010375, + "grad_norm": 7.1875, + "grad_norm_var": 1.959228515625, + "learning_rate": 0.0001, + "loss": 13.2531, + "loss/crossentropy": 2.7351680994033813, + "loss/hidden": 5.328125, + "loss/jsd": 0.0, + "loss/logits": 0.5712402760982513, + "step": 166 + }, + { + "epoch": 0.0105, + "grad_norm": 10.75, + "grad_norm_var": 1.699072265625, + "learning_rate": 0.0001, + "loss": 13.6734, + "loss/crossentropy": 2.588118314743042, + "loss/hidden": 5.359375, + "loss/jsd": 0.0, + "loss/logits": 0.5905281007289886, + "step": 168 + }, + { + "epoch": 0.010625, + "grad_norm": 9.1875, + "grad_norm_var": 1.7223795572916667, + "learning_rate": 0.0001, + "loss": 13.5448, + "loss/crossentropy": 2.7176616191864014, + "loss/hidden": 5.390625, + "loss/jsd": 0.0, + "loss/logits": 0.5851459503173828, + "step": 170 + }, + { + "epoch": 0.01075, + "grad_norm": 7.21875, + "grad_norm_var": 2.4208943684895834, + "learning_rate": 0.0001, + "loss": 13.6468, + "loss/crossentropy": 2.6239798069000244, + "loss/hidden": 5.234375, + "loss/jsd": 0.0, + "loss/logits": 0.5490683615207672, + "step": 172 + }, + { + "epoch": 0.010875, + "grad_norm": 7.625, + "grad_norm_var": 2.434273274739583, + "learning_rate": 0.0001, + "loss": 13.0877, + "loss/crossentropy": 2.648572325706482, + "loss/hidden": 5.203125, + "loss/jsd": 0.0, + "loss/logits": 0.5484789907932281, + "step": 174 + }, + { + "epoch": 0.011, + "grad_norm": 8.5, + "grad_norm_var": 2.4641764322916666, + "learning_rate": 0.0001, + "loss": 12.9536, + "loss/crossentropy": 2.6549497842788696, + "loss/hidden": 5.171875, + "loss/jsd": 0.0, + "loss/logits": 0.5412751138210297, + "step": 176 + }, + { + "epoch": 0.011125, + "grad_norm": 7.84375, + "grad_norm_var": 2.4390462239583335, + "learning_rate": 0.0001, + "loss": 13.4435, + "loss/crossentropy": 2.421927332878113, + "loss/hidden": 5.15625, + "loss/jsd": 0.0, + "loss/logits": 0.555980384349823, + "step": 178 + }, + { + "epoch": 0.01125, + "grad_norm": 8.5625, + "grad_norm_var": 2.372119140625, + "learning_rate": 0.0001, + "loss": 13.2014, + "loss/crossentropy": 2.680444836616516, + "loss/hidden": 5.046875, + "loss/jsd": 0.0, + "loss/logits": 0.4972950965166092, + "step": 180 + }, + { + "epoch": 0.011375, + "grad_norm": 11.0, + "grad_norm_var": 73.80558268229167, + "learning_rate": 0.0001, + "loss": 13.4406, + "loss/crossentropy": 2.49616801738739, + "loss/hidden": 5.296875, + "loss/jsd": 0.0, + "loss/logits": 0.5274538397789001, + "step": 182 + }, + { + "epoch": 0.0115, + "grad_norm": 9.3125, + "grad_norm_var": 74.92745768229166, + "learning_rate": 0.0001, + "loss": 12.7807, + "loss/crossentropy": 2.4947917461395264, + "loss/hidden": 5.0625, + "loss/jsd": 0.0, + "loss/logits": 0.5059804916381836, + "step": 184 + }, + { + "epoch": 0.011625, + "grad_norm": 8.125, + "grad_norm_var": 74.739697265625, + "learning_rate": 0.0001, + "loss": 12.7996, + "loss/crossentropy": 2.5512574911117554, + "loss/hidden": 5.046875, + "loss/jsd": 0.0, + "loss/logits": 0.49534276127815247, + "step": 186 + }, + { + "epoch": 0.01175, + "grad_norm": 6.90625, + "grad_norm_var": 75.45623372395833, + "learning_rate": 0.0001, + "loss": 12.934, + "loss/crossentropy": 2.630277991294861, + "loss/hidden": 5.140625, + "loss/jsd": 0.0, + "loss/logits": 0.5003396719694138, + "step": 188 + }, + { + "epoch": 0.011875, + "grad_norm": 7.625, + "grad_norm_var": 76.01287434895833, + "learning_rate": 0.0001, + "loss": 12.9007, + "loss/crossentropy": 2.5264713764190674, + "loss/hidden": 5.015625, + "loss/jsd": 0.0, + "loss/logits": 0.5094788670539856, + "step": 190 + }, + { + "epoch": 0.012, + "grad_norm": 7.71875, + "grad_norm_var": 76.15037434895834, + "learning_rate": 0.0001, + "loss": 13.0658, + "loss/crossentropy": 2.704426646232605, + "loss/hidden": 5.046875, + "loss/jsd": 0.0, + "loss/logits": 0.522599458694458, + "step": 192 + }, + { + "epoch": 0.012125, + "grad_norm": 8.1875, + "grad_norm_var": 76.128125, + "learning_rate": 0.0001, + "loss": 12.9704, + "loss/crossentropy": 2.702088713645935, + "loss/hidden": 5.09375, + "loss/jsd": 0.0, + "loss/logits": 0.5415545701980591, + "step": 194 + }, + { + "epoch": 0.01225, + "grad_norm": 6.8125, + "grad_norm_var": 76.61405843098959, + "learning_rate": 0.0001, + "loss": 12.7, + "loss/crossentropy": 2.705085873603821, + "loss/hidden": 5.0625, + "loss/jsd": 0.0, + "loss/logits": 0.5454063713550568, + "step": 196 + }, + { + "epoch": 0.012375, + "grad_norm": 8.25, + "grad_norm_var": 0.5269368489583334, + "learning_rate": 0.0001, + "loss": 12.8295, + "loss/crossentropy": 2.5774309635162354, + "loss/hidden": 5.015625, + "loss/jsd": 0.0, + "loss/logits": 0.5220803320407867, + "step": 198 + }, + { + "epoch": 0.0125, + "grad_norm": 6.90625, + "grad_norm_var": 0.41435139973958335, + "learning_rate": 0.0001, + "loss": 12.6159, + "loss/crossentropy": 2.1811429262161255, + "loss/hidden": 4.9375, + "loss/jsd": 0.0, + "loss/logits": 0.4608597755432129, + "step": 200 + }, + { + "epoch": 0.012625, + "grad_norm": 7.46875, + "grad_norm_var": 0.252587890625, + "learning_rate": 0.0001, + "loss": 12.4811, + "loss/crossentropy": 2.6321566104888916, + "loss/hidden": 5.046875, + "loss/jsd": 0.0, + "loss/logits": 0.5277400612831116, + "step": 202 + }, + { + "epoch": 0.01275, + "grad_norm": 8.125, + "grad_norm_var": 0.29940999348958336, + "learning_rate": 0.0001, + "loss": 12.7234, + "loss/crossentropy": 2.4674800634384155, + "loss/hidden": 4.984375, + "loss/jsd": 0.0, + "loss/logits": 0.4902832508087158, + "step": 204 + }, + { + "epoch": 0.012875, + "grad_norm": 7.0625, + "grad_norm_var": 0.3092732747395833, + "learning_rate": 0.0001, + "loss": 12.6082, + "loss/crossentropy": 2.418899178504944, + "loss/hidden": 5.078125, + "loss/jsd": 0.0, + "loss/logits": 0.44220657646656036, + "step": 206 + }, + { + "epoch": 0.013, + "grad_norm": 8.1875, + "grad_norm_var": 0.31503499348958336, + "learning_rate": 0.0001, + "loss": 12.2736, + "loss/crossentropy": 2.5233161449432373, + "loss/hidden": 5.0, + "loss/jsd": 0.0, + "loss/logits": 0.4981265068054199, + "step": 208 + }, + { + "epoch": 0.013125, + "grad_norm": 7.71875, + "grad_norm_var": 0.35037434895833336, + "learning_rate": 0.0001, + "loss": 12.4505, + "loss/crossentropy": 2.5429080724716187, + "loss/hidden": 4.921875, + "loss/jsd": 0.0, + "loss/logits": 0.49663229286670685, + "step": 210 + }, + { + "epoch": 0.01325, + "grad_norm": 7.96875, + "grad_norm_var": 0.29889322916666666, + "learning_rate": 0.0001, + "loss": 12.9271, + "loss/crossentropy": 2.7774670124053955, + "loss/hidden": 4.96875, + "loss/jsd": 0.0, + "loss/logits": 0.5223149955272675, + "step": 212 + }, + { + "epoch": 0.013375, + "grad_norm": 6.5625, + "grad_norm_var": 0.37265218098958336, + "learning_rate": 0.0001, + "loss": 12.2444, + "loss/crossentropy": 2.550992965698242, + "loss/hidden": 4.890625, + "loss/jsd": 0.0, + "loss/logits": 0.4571031928062439, + "step": 214 + }, + { + "epoch": 0.0135, + "grad_norm": 7.53125, + "grad_norm_var": 0.29010009765625, + "learning_rate": 0.0001, + "loss": 12.1625, + "loss/crossentropy": 2.2462257146835327, + "loss/hidden": 4.875, + "loss/jsd": 0.0, + "loss/logits": 0.48005372285842896, + "step": 216 + }, + { + "epoch": 0.013625, + "grad_norm": 7.875, + "grad_norm_var": 0.298046875, + "learning_rate": 0.0001, + "loss": 12.3998, + "loss/crossentropy": 2.617794990539551, + "loss/hidden": 4.890625, + "loss/jsd": 0.0, + "loss/logits": 0.5356446206569672, + "step": 218 + }, + { + "epoch": 0.01375, + "grad_norm": 6.84375, + "grad_norm_var": 0.21796875, + "learning_rate": 0.0001, + "loss": 12.4159, + "loss/crossentropy": 2.432363748550415, + "loss/hidden": 4.90625, + "loss/jsd": 0.0, + "loss/logits": 0.5167964398860931, + "step": 220 + }, + { + "epoch": 0.013875, + "grad_norm": 7.71875, + "grad_norm_var": 0.21158447265625, + "learning_rate": 0.0001, + "loss": 12.267, + "loss/crossentropy": 2.5088655948638916, + "loss/hidden": 4.90625, + "loss/jsd": 0.0, + "loss/logits": 0.5033310353755951, + "step": 222 + }, + { + "epoch": 0.014, + "grad_norm": 8.6875, + "grad_norm_var": 0.27616780598958335, + "learning_rate": 0.0001, + "loss": 12.5268, + "loss/crossentropy": 2.463010787963867, + "loss/hidden": 4.953125, + "loss/jsd": 0.0, + "loss/logits": 0.5101533681154251, + "step": 224 + }, + { + "epoch": 0.014125, + "grad_norm": 7.40625, + "grad_norm_var": 0.2908203125, + "learning_rate": 0.0001, + "loss": 12.4375, + "loss/crossentropy": 2.7142781019210815, + "loss/hidden": 4.96875, + "loss/jsd": 0.0, + "loss/logits": 0.48157520592212677, + "step": 226 + }, + { + "epoch": 0.01425, + "grad_norm": 6.84375, + "grad_norm_var": 0.36500244140625, + "learning_rate": 0.0001, + "loss": 12.1223, + "loss/crossentropy": 2.696410059928894, + "loss/hidden": 4.859375, + "loss/jsd": 0.0, + "loss/logits": 0.4689347445964813, + "step": 228 + }, + { + "epoch": 0.014375, + "grad_norm": 7.84375, + "grad_norm_var": 0.3846354166666667, + "learning_rate": 0.0001, + "loss": 12.2157, + "loss/crossentropy": 2.5104721784591675, + "loss/hidden": 4.84375, + "loss/jsd": 0.0, + "loss/logits": 0.4914693534374237, + "step": 230 + }, + { + "epoch": 0.0145, + "grad_norm": 6.875, + "grad_norm_var": 0.41243082682291665, + "learning_rate": 0.0001, + "loss": 12.3422, + "loss/crossentropy": 2.638755679130554, + "loss/hidden": 4.953125, + "loss/jsd": 0.0, + "loss/logits": 0.5526512563228607, + "step": 232 + }, + { + "epoch": 0.014625, + "grad_norm": 6.1875, + "grad_norm_var": 0.521337890625, + "learning_rate": 0.0001, + "loss": 12.0554, + "loss/crossentropy": 2.48711097240448, + "loss/hidden": 4.828125, + "loss/jsd": 0.0, + "loss/logits": 0.4476258456707001, + "step": 234 + }, + { + "epoch": 0.01475, + "grad_norm": 8.375, + "grad_norm_var": 0.54586181640625, + "learning_rate": 0.0001, + "loss": 12.2178, + "loss/crossentropy": 2.5519078969955444, + "loss/hidden": 4.96875, + "loss/jsd": 0.0, + "loss/logits": 0.456609308719635, + "step": 236 + }, + { + "epoch": 0.014875, + "grad_norm": 7.125, + "grad_norm_var": 0.563671875, + "learning_rate": 0.0001, + "loss": 12.0904, + "loss/crossentropy": 2.4752217531204224, + "loss/hidden": 4.84375, + "loss/jsd": 0.0, + "loss/logits": 0.4934917986392975, + "step": 238 + }, + { + "epoch": 0.015, + "grad_norm": 6.34375, + "grad_norm_var": 0.5327962239583334, + "learning_rate": 0.0001, + "loss": 12.3964, + "loss/crossentropy": 2.834780216217041, + "loss/hidden": 4.921875, + "loss/jsd": 0.0, + "loss/logits": 0.5274220705032349, + "step": 240 + }, + { + "epoch": 0.015125, + "grad_norm": 10.1875, + "grad_norm_var": 1.0213826497395833, + "learning_rate": 0.0001, + "loss": 12.6241, + "loss/crossentropy": 2.6608060598373413, + "loss/hidden": 4.734375, + "loss/jsd": 0.0, + "loss/logits": 0.5230352878570557, + "step": 242 + }, + { + "epoch": 0.01525, + "grad_norm": 9.0625, + "grad_norm_var": 1.1087890625, + "learning_rate": 0.0001, + "loss": 12.4601, + "loss/crossentropy": 2.5716851949691772, + "loss/hidden": 4.8125, + "loss/jsd": 0.0, + "loss/logits": 0.5314892381429672, + "step": 244 + }, + { + "epoch": 0.015375, + "grad_norm": 7.46875, + "grad_norm_var": 1.082666015625, + "learning_rate": 0.0001, + "loss": 12.2352, + "loss/crossentropy": 2.655149459838867, + "loss/hidden": 4.8125, + "loss/jsd": 0.0, + "loss/logits": 0.4710581600666046, + "step": 246 + }, + { + "epoch": 0.0155, + "grad_norm": 5.65625, + "grad_norm_var": 1.328515625, + "learning_rate": 0.0001, + "loss": 11.692, + "loss/crossentropy": 2.1561089754104614, + "loss/hidden": 4.65625, + "loss/jsd": 0.0, + "loss/logits": 0.45925989747047424, + "step": 248 + }, + { + "epoch": 0.015625, + "grad_norm": 6.75, + "grad_norm_var": 1.30445556640625, + "learning_rate": 0.0001, + "loss": 11.7933, + "loss/crossentropy": 2.410550117492676, + "loss/hidden": 4.546875, + "loss/jsd": 0.0, + "loss/logits": 0.4565409719944, + "step": 250 + }, + { + "epoch": 0.01575, + "grad_norm": 6.84375, + "grad_norm_var": 1.2310831705729166, + "learning_rate": 0.0001, + "loss": 11.88, + "loss/crossentropy": 2.586890459060669, + "loss/hidden": 4.6875, + "loss/jsd": 0.0, + "loss/logits": 0.4949754476547241, + "step": 252 + }, + { + "epoch": 0.015875, + "grad_norm": 6.4375, + "grad_norm_var": 1.3586873372395833, + "learning_rate": 0.0001, + "loss": 12.0459, + "loss/crossentropy": 2.446923851966858, + "loss/hidden": 4.8125, + "loss/jsd": 0.0, + "loss/logits": 0.46594707667827606, + "step": 254 + }, + { + "epoch": 0.016, + "grad_norm": 6.40625, + "grad_norm_var": 1.3482381184895833, + "learning_rate": 0.0001, + "loss": 12.1629, + "loss/crossentropy": 2.57769775390625, + "loss/hidden": 4.65625, + "loss/jsd": 0.0, + "loss/logits": 0.4595105051994324, + "step": 256 + }, + { + "epoch": 0.016125, + "grad_norm": 6.625, + "grad_norm_var": 0.7177734375, + "learning_rate": 0.0001, + "loss": 12.0016, + "loss/crossentropy": 2.37365186214447, + "loss/hidden": 4.625, + "loss/jsd": 0.0, + "loss/logits": 0.4366963058710098, + "step": 258 + }, + { + "epoch": 0.01625, + "grad_norm": 6.1875, + "grad_norm_var": 0.43580322265625, + "learning_rate": 0.0001, + "loss": 11.9528, + "loss/crossentropy": 2.3685457706451416, + "loss/hidden": 4.75, + "loss/jsd": 0.0, + "loss/logits": 0.45073819160461426, + "step": 260 + }, + { + "epoch": 0.016375, + "grad_norm": 6.8125, + "grad_norm_var": 0.26324462890625, + "learning_rate": 0.0001, + "loss": 12.0193, + "loss/crossentropy": 2.349572777748108, + "loss/hidden": 4.71875, + "loss/jsd": 0.0, + "loss/logits": 0.4415210783481598, + "step": 262 + }, + { + "epoch": 0.0165, + "grad_norm": 6.4375, + "grad_norm_var": 0.204150390625, + "learning_rate": 0.0001, + "loss": 11.828, + "loss/crossentropy": 2.376862049102783, + "loss/hidden": 4.53125, + "loss/jsd": 0.0, + "loss/logits": 0.4309367835521698, + "step": 264 + }, + { + "epoch": 0.016625, + "grad_norm": 6.6875, + "grad_norm_var": 0.20725504557291666, + "learning_rate": 0.0001, + "loss": 11.6319, + "loss/crossentropy": 2.316063165664673, + "loss/hidden": 4.65625, + "loss/jsd": 0.0, + "loss/logits": 0.43444499373435974, + "step": 266 + }, + { + "epoch": 0.01675, + "grad_norm": 5.5625, + "grad_norm_var": 0.18316650390625, + "learning_rate": 0.0001, + "loss": 11.7038, + "loss/crossentropy": 2.402170777320862, + "loss/hidden": 4.65625, + "loss/jsd": 0.0, + "loss/logits": 0.44065244495868683, + "step": 268 + }, + { + "epoch": 0.016875, + "grad_norm": 6.5625, + "grad_norm_var": 0.17633056640625, + "learning_rate": 0.0001, + "loss": 11.6722, + "loss/crossentropy": 2.5016753673553467, + "loss/hidden": 4.515625, + "loss/jsd": 0.0, + "loss/logits": 0.4539669454097748, + "step": 270 + }, + { + "epoch": 0.017, + "grad_norm": 6.34375, + "grad_norm_var": 0.15442708333333333, + "learning_rate": 0.0001, + "loss": 11.756, + "loss/crossentropy": 2.344158887863159, + "loss/hidden": 4.46875, + "loss/jsd": 0.0, + "loss/logits": 0.46556878089904785, + "step": 272 + }, + { + "epoch": 0.017125, + "grad_norm": 6.25, + "grad_norm_var": 0.18977864583333334, + "learning_rate": 0.0001, + "loss": 11.4672, + "loss/crossentropy": 2.3533241748809814, + "loss/hidden": 4.5625, + "loss/jsd": 0.0, + "loss/logits": 0.42739230394363403, + "step": 274 + }, + { + "epoch": 0.01725, + "grad_norm": 8.0625, + "grad_norm_var": 0.48045247395833335, + "learning_rate": 0.0001, + "loss": 11.7908, + "loss/crossentropy": 2.360278367996216, + "loss/hidden": 4.546875, + "loss/jsd": 0.0, + "loss/logits": 0.46030762791633606, + "step": 276 + }, + { + "epoch": 0.017375, + "grad_norm": 6.5625, + "grad_norm_var": 0.45963134765625, + "learning_rate": 0.0001, + "loss": 11.8649, + "loss/crossentropy": 2.433812379837036, + "loss/hidden": 4.640625, + "loss/jsd": 0.0, + "loss/logits": 0.4665477126836777, + "step": 278 + }, + { + "epoch": 0.0175, + "grad_norm": 5.34375, + "grad_norm_var": 0.56021728515625, + "learning_rate": 0.0001, + "loss": 11.8492, + "loss/crossentropy": 2.4910370111465454, + "loss/hidden": 4.625, + "loss/jsd": 0.0, + "loss/logits": 0.4576037526130676, + "step": 280 + }, + { + "epoch": 0.017625, + "grad_norm": 7.4375, + "grad_norm_var": 0.601171875, + "learning_rate": 0.0001, + "loss": 11.7673, + "loss/crossentropy": 2.730518341064453, + "loss/hidden": 4.765625, + "loss/jsd": 0.0, + "loss/logits": 0.5011050552129745, + "step": 282 + }, + { + "epoch": 0.01775, + "grad_norm": 6.03125, + "grad_norm_var": 0.5543904622395833, + "learning_rate": 0.0001, + "loss": 11.739, + "loss/crossentropy": 2.53789222240448, + "loss/hidden": 4.625, + "loss/jsd": 0.0, + "loss/logits": 0.44618333876132965, + "step": 284 + }, + { + "epoch": 0.017875, + "grad_norm": 6.3125, + "grad_norm_var": 0.5624308268229167, + "learning_rate": 0.0001, + "loss": 12.0365, + "loss/crossentropy": 2.5072152614593506, + "loss/hidden": 4.5, + "loss/jsd": 0.0, + "loss/logits": 0.4584163427352905, + "step": 286 + }, + { + "epoch": 0.018, + "grad_norm": 7.0, + "grad_norm_var": 0.56744384765625, + "learning_rate": 0.0001, + "loss": 11.7216, + "loss/crossentropy": 2.713698387145996, + "loss/hidden": 4.65625, + "loss/jsd": 0.0, + "loss/logits": 0.47771963477134705, + "step": 288 + }, + { + "epoch": 0.018125, + "grad_norm": 5.34375, + "grad_norm_var": 0.5739420572916667, + "learning_rate": 0.0001, + "loss": 11.3088, + "loss/crossentropy": 2.4359713792800903, + "loss/hidden": 4.53125, + "loss/jsd": 0.0, + "loss/logits": 0.4259905219078064, + "step": 290 + }, + { + "epoch": 0.01825, + "grad_norm": 7.5, + "grad_norm_var": 0.36584879557291666, + "learning_rate": 0.0001, + "loss": 11.7531, + "loss/crossentropy": 2.5594223737716675, + "loss/hidden": 4.578125, + "loss/jsd": 0.0, + "loss/logits": 0.4502502828836441, + "step": 292 + }, + { + "epoch": 0.018375, + "grad_norm": 10.1875, + "grad_norm_var": 1.243994140625, + "learning_rate": 0.0001, + "loss": 12.587, + "loss/crossentropy": 2.979864239692688, + "loss/hidden": 4.90625, + "loss/jsd": 0.0, + "loss/logits": 0.6050755679607391, + "step": 294 + }, + { + "epoch": 0.0185, + "grad_norm": 6.125, + "grad_norm_var": 3.3922159830729166, + "learning_rate": 0.0001, + "loss": 12.0734, + "loss/crossentropy": 2.6472045183181763, + "loss/hidden": 4.515625, + "loss/jsd": 0.0, + "loss/logits": 0.4378223419189453, + "step": 296 + }, + { + "epoch": 0.018625, + "grad_norm": 6.125, + "grad_norm_var": 3.450972493489583, + "learning_rate": 0.0001, + "loss": 11.4956, + "loss/crossentropy": 2.5419256687164307, + "loss/hidden": 4.546875, + "loss/jsd": 0.0, + "loss/logits": 0.4189437925815582, + "step": 298 + }, + { + "epoch": 0.01875, + "grad_norm": 6.4375, + "grad_norm_var": 3.3952433268229165, + "learning_rate": 0.0001, + "loss": 11.4754, + "loss/crossentropy": 2.4922432899475098, + "loss/hidden": 4.53125, + "loss/jsd": 0.0, + "loss/logits": 0.4700406640768051, + "step": 300 + }, + { + "epoch": 0.018875, + "grad_norm": 5.8125, + "grad_norm_var": 3.46099853515625, + "learning_rate": 0.0001, + "loss": 11.4791, + "loss/crossentropy": 2.1792010068893433, + "loss/hidden": 4.390625, + "loss/jsd": 0.0, + "loss/logits": 0.4193985015153885, + "step": 302 + }, + { + "epoch": 0.019, + "grad_norm": 5.8125, + "grad_norm_var": 3.5378743489583333, + "learning_rate": 0.0001, + "loss": 11.4713, + "loss/crossentropy": 2.5582441091537476, + "loss/hidden": 4.5625, + "loss/jsd": 0.0, + "loss/logits": 0.45763692259788513, + "step": 304 + }, + { + "epoch": 0.019125, + "grad_norm": 6.78125, + "grad_norm_var": 3.42242431640625, + "learning_rate": 0.0001, + "loss": 11.5659, + "loss/crossentropy": 2.501905083656311, + "loss/hidden": 4.453125, + "loss/jsd": 0.0, + "loss/logits": 0.43473342061042786, + "step": 306 + }, + { + "epoch": 0.01925, + "grad_norm": 5.9375, + "grad_norm_var": 3.487744140625, + "learning_rate": 0.0001, + "loss": 11.6014, + "loss/crossentropy": 2.5585442781448364, + "loss/hidden": 4.625, + "loss/jsd": 0.0, + "loss/logits": 0.5095447897911072, + "step": 308 + }, + { + "epoch": 0.019375, + "grad_norm": 6.21875, + "grad_norm_var": 2.76822509765625, + "learning_rate": 0.0001, + "loss": 11.567, + "loss/crossentropy": 2.4677284955978394, + "loss/hidden": 4.546875, + "loss/jsd": 0.0, + "loss/logits": 0.4294033944606781, + "step": 310 + }, + { + "epoch": 0.0195, + "grad_norm": 6.25, + "grad_norm_var": 0.22245686848958332, + "learning_rate": 0.0001, + "loss": 11.2982, + "loss/crossentropy": 2.3961949348449707, + "loss/hidden": 4.5, + "loss/jsd": 0.0, + "loss/logits": 0.4119955450296402, + "step": 312 + }, + { + "epoch": 0.019625, + "grad_norm": 5.9375, + "grad_norm_var": 0.32304280598958335, + "learning_rate": 0.0001, + "loss": 11.5074, + "loss/crossentropy": 2.310800790786743, + "loss/hidden": 4.546875, + "loss/jsd": 0.0, + "loss/logits": 0.42905446887016296, + "step": 314 + }, + { + "epoch": 0.01975, + "grad_norm": 5.8125, + "grad_norm_var": 0.33020426432291666, + "learning_rate": 0.0001, + "loss": 11.3422, + "loss/crossentropy": 2.4780253171920776, + "loss/hidden": 4.421875, + "loss/jsd": 0.0, + "loss/logits": 0.45472322404384613, + "step": 316 + }, + { + "epoch": 0.019875, + "grad_norm": 5.9375, + "grad_norm_var": 0.33508707682291666, + "learning_rate": 0.0001, + "loss": 11.5055, + "loss/crossentropy": 2.6155530214309692, + "loss/hidden": 4.578125, + "loss/jsd": 0.0, + "loss/logits": 0.48140254616737366, + "step": 318 + }, + { + "epoch": 0.02, + "grad_norm": 6.5, + "grad_norm_var": 0.31060791015625, + "learning_rate": 0.0001, + "loss": 11.3301, + "loss/crossentropy": 2.5840269327163696, + "loss/hidden": 4.6875, + "loss/jsd": 0.0, + "loss/logits": 0.4504729211330414, + "step": 320 + }, + { + "epoch": 0.020125, + "grad_norm": 5.9375, + "grad_norm_var": 0.3453125, + "learning_rate": 0.0001, + "loss": 11.7327, + "loss/crossentropy": 2.5163029432296753, + "loss/hidden": 4.5, + "loss/jsd": 0.0, + "loss/logits": 0.45117421448230743, + "step": 322 + }, + { + "epoch": 0.02025, + "grad_norm": 5.375, + "grad_norm_var": 0.71353759765625, + "learning_rate": 0.0001, + "loss": 11.4897, + "loss/crossentropy": 2.4774335622787476, + "loss/hidden": 4.453125, + "loss/jsd": 0.0, + "loss/logits": 0.4575677663087845, + "step": 324 + }, + { + "epoch": 0.020375, + "grad_norm": 6.78125, + "grad_norm_var": 0.67857666015625, + "learning_rate": 0.0001, + "loss": 11.1918, + "loss/crossentropy": 2.431584596633911, + "loss/hidden": 4.390625, + "loss/jsd": 0.0, + "loss/logits": 0.4284675121307373, + "step": 326 + }, + { + "epoch": 0.0205, + "grad_norm": 6.03125, + "grad_norm_var": 0.6340779622395833, + "learning_rate": 0.0001, + "loss": 11.2356, + "loss/crossentropy": 2.2354328632354736, + "loss/hidden": 4.53125, + "loss/jsd": 0.0, + "loss/logits": 0.44321516156196594, + "step": 328 + }, + { + "epoch": 0.020625, + "grad_norm": 5.4375, + "grad_norm_var": 0.5881510416666667, + "learning_rate": 0.0001, + "loss": 11.3096, + "loss/crossentropy": 2.349725842475891, + "loss/hidden": 4.421875, + "loss/jsd": 0.0, + "loss/logits": 0.39839838445186615, + "step": 330 + }, + { + "epoch": 0.02075, + "grad_norm": 5.625, + "grad_norm_var": 0.6135050455729166, + "learning_rate": 0.0001, + "loss": 11.5636, + "loss/crossentropy": 2.704393744468689, + "loss/hidden": 4.4375, + "loss/jsd": 0.0, + "loss/logits": 0.42811933159828186, + "step": 332 + }, + { + "epoch": 0.020875, + "grad_norm": 5.71875, + "grad_norm_var": 0.6152180989583333, + "learning_rate": 0.0001, + "loss": 11.1634, + "loss/crossentropy": 2.2040624618530273, + "loss/hidden": 4.578125, + "loss/jsd": 0.0, + "loss/logits": 0.38388490676879883, + "step": 334 + }, + { + "epoch": 0.021, + "grad_norm": 7.15625, + "grad_norm_var": 0.6815104166666667, + "learning_rate": 0.0001, + "loss": 11.2019, + "loss/crossentropy": 2.334352135658264, + "loss/hidden": 4.359375, + "loss/jsd": 0.0, + "loss/logits": 0.4286506623029709, + "step": 336 + }, + { + "epoch": 0.021125, + "grad_norm": 6.21875, + "grad_norm_var": 0.6352213541666667, + "learning_rate": 0.0001, + "loss": 11.1504, + "loss/crossentropy": 2.390920877456665, + "loss/hidden": 4.328125, + "loss/jsd": 0.0, + "loss/logits": 0.4297266751527786, + "step": 338 + }, + { + "epoch": 0.02125, + "grad_norm": 5.40625, + "grad_norm_var": 0.27034098307291665, + "learning_rate": 0.0001, + "loss": 10.968, + "loss/crossentropy": 2.589638829231262, + "loss/hidden": 4.453125, + "loss/jsd": 0.0, + "loss/logits": 0.41065070033073425, + "step": 340 + }, + { + "epoch": 0.021375, + "grad_norm": 6.59375, + "grad_norm_var": 1.3688151041666667, + "learning_rate": 0.0001, + "loss": 11.5793, + "loss/crossentropy": 2.735411524772644, + "loss/hidden": 4.421875, + "loss/jsd": 0.0, + "loss/logits": 0.43197204172611237, + "step": 342 + }, + { + "epoch": 0.0215, + "grad_norm": 6.34375, + "grad_norm_var": 1.35992431640625, + "learning_rate": 0.0001, + "loss": 11.1228, + "loss/crossentropy": 2.424543857574463, + "loss/hidden": 4.375, + "loss/jsd": 0.0, + "loss/logits": 0.4435572326183319, + "step": 344 + }, + { + "epoch": 0.021625, + "grad_norm": 6.15625, + "grad_norm_var": 1.3089803059895833, + "learning_rate": 0.0001, + "loss": 11.3887, + "loss/crossentropy": 2.3145734071731567, + "loss/hidden": 4.484375, + "loss/jsd": 0.0, + "loss/logits": 0.4381801038980484, + "step": 346 + }, + { + "epoch": 0.02175, + "grad_norm": 5.90625, + "grad_norm_var": 1.2807576497395834, + "learning_rate": 0.0001, + "loss": 11.3132, + "loss/crossentropy": 2.4313782453536987, + "loss/hidden": 4.3125, + "loss/jsd": 0.0, + "loss/logits": 0.44939403235912323, + "step": 348 + }, + { + "epoch": 0.021875, + "grad_norm": 6.3125, + "grad_norm_var": 1.2580078125, + "learning_rate": 0.0001, + "loss": 11.2425, + "loss/crossentropy": 2.2621657848358154, + "loss/hidden": 4.296875, + "loss/jsd": 0.0, + "loss/logits": 0.4138593226671219, + "step": 350 + }, + { + "epoch": 0.022, + "grad_norm": 5.78125, + "grad_norm_var": 1.2151652018229167, + "learning_rate": 0.0001, + "loss": 11.4478, + "loss/crossentropy": 2.9294599294662476, + "loss/hidden": 4.40625, + "loss/jsd": 0.0, + "loss/logits": 0.43130405247211456, + "step": 352 + }, + { + "epoch": 0.022125, + "grad_norm": 5.28125, + "grad_norm_var": 1.3285807291666667, + "learning_rate": 0.0001, + "loss": 11.0711, + "loss/crossentropy": 2.4152419567108154, + "loss/hidden": 4.453125, + "loss/jsd": 0.0, + "loss/logits": 0.4246339052915573, + "step": 354 + }, + { + "epoch": 0.02225, + "grad_norm": 5.28125, + "grad_norm_var": 1.4159993489583333, + "learning_rate": 0.0001, + "loss": 10.4805, + "loss/crossentropy": 2.342584490776062, + "loss/hidden": 4.3125, + "loss/jsd": 0.0, + "loss/logits": 0.39343252778053284, + "step": 356 + }, + { + "epoch": 0.022375, + "grad_norm": 5.3125, + "grad_norm_var": 0.24654947916666667, + "learning_rate": 0.0001, + "loss": 10.7117, + "loss/crossentropy": 2.268111824989319, + "loss/hidden": 4.25, + "loss/jsd": 0.0, + "loss/logits": 0.37643152475357056, + "step": 358 + }, + { + "epoch": 0.0225, + "grad_norm": 5.4375, + "grad_norm_var": 0.26873372395833334, + "learning_rate": 0.0001, + "loss": 10.8794, + "loss/crossentropy": 2.2588669061660767, + "loss/hidden": 4.203125, + "loss/jsd": 0.0, + "loss/logits": 0.38288983702659607, + "step": 360 + }, + { + "epoch": 0.022625, + "grad_norm": 5.46875, + "grad_norm_var": 0.2638631184895833, + "learning_rate": 0.0001, + "loss": 11.0366, + "loss/crossentropy": 2.458423137664795, + "loss/hidden": 4.25, + "loss/jsd": 0.0, + "loss/logits": 0.42026595771312714, + "step": 362 + }, + { + "epoch": 0.02275, + "grad_norm": 6.8125, + "grad_norm_var": 0.31164957682291666, + "learning_rate": 0.0001, + "loss": 11.3242, + "loss/crossentropy": 2.321816086769104, + "loss/hidden": 4.28125, + "loss/jsd": 0.0, + "loss/logits": 0.3996615409851074, + "step": 364 + }, + { + "epoch": 0.022875, + "grad_norm": 5.4375, + "grad_norm_var": 0.30230712890625, + "learning_rate": 0.0001, + "loss": 10.7526, + "loss/crossentropy": 2.370081901550293, + "loss/hidden": 4.234375, + "loss/jsd": 0.0, + "loss/logits": 0.39483100175857544, + "step": 366 + }, + { + "epoch": 0.023, + "grad_norm": 5.40625, + "grad_norm_var": 0.22395833333333334, + "learning_rate": 0.0001, + "loss": 10.7775, + "loss/crossentropy": 2.292533278465271, + "loss/hidden": 4.234375, + "loss/jsd": 0.0, + "loss/logits": 0.4388478994369507, + "step": 368 + }, + { + "epoch": 0.023125, + "grad_norm": 5.5, + "grad_norm_var": 0.22610270182291667, + "learning_rate": 0.0001, + "loss": 10.8841, + "loss/crossentropy": 2.3297876119613647, + "loss/hidden": 4.25, + "loss/jsd": 0.0, + "loss/logits": 0.4065796434879303, + "step": 370 + }, + { + "epoch": 0.02325, + "grad_norm": 6.125, + "grad_norm_var": 0.23635660807291667, + "learning_rate": 0.0001, + "loss": 10.7713, + "loss/crossentropy": 2.5502147674560547, + "loss/hidden": 4.296875, + "loss/jsd": 0.0, + "loss/logits": 0.3998092859983444, + "step": 372 + }, + { + "epoch": 0.023375, + "grad_norm": 5.15625, + "grad_norm_var": 0.23821614583333334, + "learning_rate": 0.0001, + "loss": 11.0205, + "loss/crossentropy": 2.385592818260193, + "loss/hidden": 4.296875, + "loss/jsd": 0.0, + "loss/logits": 0.4194333106279373, + "step": 374 + }, + { + "epoch": 0.0235, + "grad_norm": 6.09375, + "grad_norm_var": 0.23645833333333333, + "learning_rate": 0.0001, + "loss": 10.8672, + "loss/crossentropy": 2.4565255641937256, + "loss/hidden": 4.3125, + "loss/jsd": 0.0, + "loss/logits": 0.4082058221101761, + "step": 376 + }, + { + "epoch": 0.023625, + "grad_norm": 5.28125, + "grad_norm_var": 0.27864176432291665, + "learning_rate": 0.0001, + "loss": 11.2051, + "loss/crossentropy": 2.621595621109009, + "loss/hidden": 4.484375, + "loss/jsd": 0.0, + "loss/logits": 0.44046278297901154, + "step": 378 + }, + { + "epoch": 0.02375, + "grad_norm": 5.875, + "grad_norm_var": 0.18631184895833333, + "learning_rate": 0.0001, + "loss": 11.0582, + "loss/crossentropy": 2.856778144836426, + "loss/hidden": 4.328125, + "loss/jsd": 0.0, + "loss/logits": 0.42068275809288025, + "step": 380 + }, + { + "epoch": 0.023875, + "grad_norm": 5.96875, + "grad_norm_var": 0.18183186848958333, + "learning_rate": 0.0001, + "loss": 10.952, + "loss/crossentropy": 2.5854907035827637, + "loss/hidden": 4.265625, + "loss/jsd": 0.0, + "loss/logits": 0.40150247514247894, + "step": 382 + }, + { + "epoch": 0.024, + "grad_norm": 5.8125, + "grad_norm_var": 0.17877604166666666, + "learning_rate": 0.0001, + "loss": 10.7008, + "loss/crossentropy": 2.3077510595321655, + "loss/hidden": 4.234375, + "loss/jsd": 0.0, + "loss/logits": 0.39931294322013855, + "step": 384 + }, + { + "epoch": 0.024125, + "grad_norm": 5.40625, + "grad_norm_var": 0.172509765625, + "learning_rate": 0.0001, + "loss": 10.931, + "loss/crossentropy": 2.533818006515503, + "loss/hidden": 4.3125, + "loss/jsd": 0.0, + "loss/logits": 0.43096111714839935, + "step": 386 + }, + { + "epoch": 0.02425, + "grad_norm": 6.5625, + "grad_norm_var": 0.21328125, + "learning_rate": 0.0001, + "loss": 10.9486, + "loss/crossentropy": 2.2463923692703247, + "loss/hidden": 4.21875, + "loss/jsd": 0.0, + "loss/logits": 0.4105375409126282, + "step": 388 + }, + { + "epoch": 0.024375, + "grad_norm": 6.40625, + "grad_norm_var": 0.21083577473958334, + "learning_rate": 0.0001, + "loss": 10.983, + "loss/crossentropy": 2.6630618572235107, + "loss/hidden": 4.296875, + "loss/jsd": 0.0, + "loss/logits": 0.4139321595430374, + "step": 390 + }, + { + "epoch": 0.0245, + "grad_norm": 4.8125, + "grad_norm_var": 0.24898681640625, + "learning_rate": 0.0001, + "loss": 10.6566, + "loss/crossentropy": 2.309110641479492, + "loss/hidden": 4.21875, + "loss/jsd": 0.0, + "loss/logits": 0.35926803946495056, + "step": 392 + }, + { + "epoch": 0.024625, + "grad_norm": 5.4375, + "grad_norm_var": 0.21187744140625, + "learning_rate": 0.0001, + "loss": 10.6611, + "loss/crossentropy": 2.554847478866577, + "loss/hidden": 4.265625, + "loss/jsd": 0.0, + "loss/logits": 0.4162745624780655, + "step": 394 + }, + { + "epoch": 0.02475, + "grad_norm": 5.21875, + "grad_norm_var": 0.21630452473958334, + "learning_rate": 0.0001, + "loss": 10.5844, + "loss/crossentropy": 2.6489609479904175, + "loss/hidden": 4.265625, + "loss/jsd": 0.0, + "loss/logits": 0.40658123791217804, + "step": 396 + }, + { + "epoch": 0.024875, + "grad_norm": 5.03125, + "grad_norm_var": 0.21614176432291668, + "learning_rate": 0.0001, + "loss": 10.7177, + "loss/crossentropy": 2.542907238006592, + "loss/hidden": 4.21875, + "loss/jsd": 0.0, + "loss/logits": 0.3921196609735489, + "step": 398 + }, + { + "epoch": 0.025, + "grad_norm": 5.34375, + "grad_norm_var": 0.21314697265625, + "learning_rate": 0.0001, + "loss": 11.1447, + "loss/crossentropy": 2.7548632621765137, + "loss/hidden": 4.25, + "loss/jsd": 0.0, + "loss/logits": 0.42924974858760834, + "step": 400 + }, + { + "epoch": 0.025125, + "grad_norm": 7.40625, + "grad_norm_var": 0.46285400390625, + "learning_rate": 0.0001, + "loss": 10.7675, + "loss/crossentropy": 2.46126389503479, + "loss/hidden": 4.171875, + "loss/jsd": 0.0, + "loss/logits": 0.39891795814037323, + "step": 402 + }, + { + "epoch": 0.02525, + "grad_norm": 6.0, + "grad_norm_var": 0.4099609375, + "learning_rate": 0.0001, + "loss": 10.6513, + "loss/crossentropy": 2.4365785121917725, + "loss/hidden": 4.25, + "loss/jsd": 0.0, + "loss/logits": 0.38683582842350006, + "step": 404 + }, + { + "epoch": 0.025375, + "grad_norm": 5.1875, + "grad_norm_var": 0.379931640625, + "learning_rate": 0.0001, + "loss": 10.8399, + "loss/crossentropy": 2.379759669303894, + "loss/hidden": 4.21875, + "loss/jsd": 0.0, + "loss/logits": 0.39575886726379395, + "step": 406 + }, + { + "epoch": 0.0255, + "grad_norm": 5.65625, + "grad_norm_var": 0.367822265625, + "learning_rate": 0.0001, + "loss": 10.6569, + "loss/crossentropy": 2.29716956615448, + "loss/hidden": 4.21875, + "loss/jsd": 0.0, + "loss/logits": 0.4569971561431885, + "step": 408 + }, + { + "epoch": 0.025625, + "grad_norm": 5.03125, + "grad_norm_var": 0.39303385416666664, + "learning_rate": 0.0001, + "loss": 10.9526, + "loss/crossentropy": 2.5199685096740723, + "loss/hidden": 4.390625, + "loss/jsd": 0.0, + "loss/logits": 0.46622008085250854, + "step": 410 + }, + { + "epoch": 0.02575, + "grad_norm": 5.03125, + "grad_norm_var": 0.40784098307291666, + "learning_rate": 0.0001, + "loss": 10.6827, + "loss/crossentropy": 2.633329153060913, + "loss/hidden": 4.1875, + "loss/jsd": 0.0, + "loss/logits": 0.3798275887966156, + "step": 412 + }, + { + "epoch": 0.025875, + "grad_norm": 5.9375, + "grad_norm_var": 0.42083333333333334, + "learning_rate": 0.0001, + "loss": 10.9265, + "loss/crossentropy": 2.690458655357361, + "loss/hidden": 4.265625, + "loss/jsd": 0.0, + "loss/logits": 0.4021689295768738, + "step": 414 + }, + { + "epoch": 0.026, + "grad_norm": 5.0625, + "grad_norm_var": 0.45126546223958336, + "learning_rate": 0.0001, + "loss": 10.9415, + "loss/crossentropy": 2.5202553272247314, + "loss/hidden": 4.203125, + "loss/jsd": 0.0, + "loss/logits": 0.41354137659072876, + "step": 416 + }, + { + "epoch": 0.026125, + "grad_norm": 5.96875, + "grad_norm_var": 0.19875895182291667, + "learning_rate": 0.0001, + "loss": 10.868, + "loss/crossentropy": 2.6249037981033325, + "loss/hidden": 4.28125, + "loss/jsd": 0.0, + "loss/logits": 0.4118567407131195, + "step": 418 + }, + { + "epoch": 0.02625, + "grad_norm": 4.625, + "grad_norm_var": 0.20584309895833333, + "learning_rate": 0.0001, + "loss": 10.6141, + "loss/crossentropy": 2.4828044176101685, + "loss/hidden": 4.15625, + "loss/jsd": 0.0, + "loss/logits": 0.3660377264022827, + "step": 420 + }, + { + "epoch": 0.026375, + "grad_norm": 5.1875, + "grad_norm_var": 0.20572916666666666, + "learning_rate": 0.0001, + "loss": 10.8148, + "loss/crossentropy": 2.5954415798187256, + "loss/hidden": 4.171875, + "loss/jsd": 0.0, + "loss/logits": 0.36801303923130035, + "step": 422 + }, + { + "epoch": 0.0265, + "grad_norm": 5.5, + "grad_norm_var": 0.18694254557291667, + "learning_rate": 0.0001, + "loss": 10.6899, + "loss/crossentropy": 2.2876476049423218, + "loss/hidden": 4.171875, + "loss/jsd": 0.0, + "loss/logits": 0.37484128773212433, + "step": 424 + }, + { + "epoch": 0.026625, + "grad_norm": 5.4375, + "grad_norm_var": 0.14928385416666667, + "learning_rate": 0.0001, + "loss": 10.92, + "loss/crossentropy": 2.5665252208709717, + "loss/hidden": 4.171875, + "loss/jsd": 0.0, + "loss/logits": 0.42709940671920776, + "step": 426 + }, + { + "epoch": 0.02675, + "grad_norm": 5.25, + "grad_norm_var": 0.16708577473958333, + "learning_rate": 0.0001, + "loss": 10.9522, + "loss/crossentropy": 2.5319453477859497, + "loss/hidden": 4.171875, + "loss/jsd": 0.0, + "loss/logits": 0.4047371447086334, + "step": 428 + }, + { + "epoch": 0.026875, + "grad_norm": 6.28125, + "grad_norm_var": 0.18268229166666666, + "learning_rate": 0.0001, + "loss": 10.7146, + "loss/crossentropy": 2.5017552375793457, + "loss/hidden": 4.171875, + "loss/jsd": 0.0, + "loss/logits": 0.40194234251976013, + "step": 430 + }, + { + "epoch": 0.027, + "grad_norm": 4.96875, + "grad_norm_var": 0.20439046223958332, + "learning_rate": 0.0001, + "loss": 10.5358, + "loss/crossentropy": 2.3956456184387207, + "loss/hidden": 4.203125, + "loss/jsd": 0.0, + "loss/logits": 0.3840651959180832, + "step": 432 + }, + { + "epoch": 0.027125, + "grad_norm": 5.4375, + "grad_norm_var": 0.19501546223958333, + "learning_rate": 0.0001, + "loss": 10.8501, + "loss/crossentropy": 2.5979639291763306, + "loss/hidden": 4.40625, + "loss/jsd": 0.0, + "loss/logits": 0.4386890381574631, + "step": 434 + }, + { + "epoch": 0.02725, + "grad_norm": 5.5, + "grad_norm_var": 0.16083577473958333, + "learning_rate": 0.0001, + "loss": 10.6397, + "loss/crossentropy": 2.707968592643738, + "loss/hidden": 4.1875, + "loss/jsd": 0.0, + "loss/logits": 0.3836878836154938, + "step": 436 + }, + { + "epoch": 0.027375, + "grad_norm": 4.78125, + "grad_norm_var": 0.18684895833333334, + "learning_rate": 0.0001, + "loss": 10.8105, + "loss/crossentropy": 2.6276891231536865, + "loss/hidden": 4.21875, + "loss/jsd": 0.0, + "loss/logits": 0.40945254266262054, + "step": 438 + }, + { + "epoch": 0.0275, + "grad_norm": 5.0625, + "grad_norm_var": 0.20623372395833334, + "learning_rate": 0.0001, + "loss": 10.1525, + "loss/crossentropy": 2.269914984703064, + "loss/hidden": 4.15625, + "loss/jsd": 0.0, + "loss/logits": 0.36995893716812134, + "step": 440 + }, + { + "epoch": 0.027625, + "grad_norm": 5.1875, + "grad_norm_var": 0.20367431640625, + "learning_rate": 0.0001, + "loss": 10.59, + "loss/crossentropy": 2.4260438680648804, + "loss/hidden": 4.0390625, + "loss/jsd": 0.0, + "loss/logits": 0.40428994596004486, + "step": 442 + }, + { + "epoch": 0.02775, + "grad_norm": 5.0625, + "grad_norm_var": 0.16868489583333332, + "learning_rate": 0.0001, + "loss": 10.6862, + "loss/crossentropy": 2.319399118423462, + "loss/hidden": 4.15625, + "loss/jsd": 0.0, + "loss/logits": 0.3804202526807785, + "step": 444 + }, + { + "epoch": 0.027875, + "grad_norm": 5.8125, + "grad_norm_var": 0.11953125, + "learning_rate": 0.0001, + "loss": 10.7438, + "loss/crossentropy": 2.556222915649414, + "loss/hidden": 4.171875, + "loss/jsd": 0.0, + "loss/logits": 0.3961835205554962, + "step": 446 + }, + { + "epoch": 0.028, + "grad_norm": 4.78125, + "grad_norm_var": 0.12519124348958333, + "learning_rate": 0.0001, + "loss": 10.7723, + "loss/crossentropy": 2.5780078172683716, + "loss/hidden": 4.21875, + "loss/jsd": 0.0, + "loss/logits": 0.4192471206188202, + "step": 448 + }, + { + "epoch": 0.028125, + "grad_norm": 5.8125, + "grad_norm_var": 0.124462890625, + "learning_rate": 0.0001, + "loss": 10.7281, + "loss/crossentropy": 2.4442174434661865, + "loss/hidden": 4.34375, + "loss/jsd": 0.0, + "loss/logits": 0.44460536539554596, + "step": 450 + }, + { + "epoch": 0.02825, + "grad_norm": 5.0, + "grad_norm_var": 0.12102864583333334, + "learning_rate": 0.0001, + "loss": 10.5439, + "loss/crossentropy": 2.3095200061798096, + "loss/hidden": 4.03125, + "loss/jsd": 0.0, + "loss/logits": 0.3880293220281601, + "step": 452 + }, + { + "epoch": 0.028375, + "grad_norm": 5.03125, + "grad_norm_var": 0.11168212890625, + "learning_rate": 0.0001, + "loss": 10.4828, + "loss/crossentropy": 2.263134002685547, + "loss/hidden": 4.15625, + "loss/jsd": 0.0, + "loss/logits": 0.4143233299255371, + "step": 454 + }, + { + "epoch": 0.0285, + "grad_norm": 5.40625, + "grad_norm_var": 0.10038655598958333, + "learning_rate": 0.0001, + "loss": 10.7777, + "loss/crossentropy": 2.4948445558547974, + "loss/hidden": 4.171875, + "loss/jsd": 0.0, + "loss/logits": 0.4150776267051697, + "step": 456 + }, + { + "epoch": 0.028625, + "grad_norm": 4.5, + "grad_norm_var": 0.14049072265625, + "learning_rate": 0.0001, + "loss": 10.2437, + "loss/crossentropy": 2.6246066093444824, + "loss/hidden": 4.1875, + "loss/jsd": 0.0, + "loss/logits": 0.4077821969985962, + "step": 458 + }, + { + "epoch": 0.02875, + "grad_norm": 4.8125, + "grad_norm_var": 0.19472249348958334, + "learning_rate": 0.0001, + "loss": 10.701, + "loss/crossentropy": 2.5822086334228516, + "loss/hidden": 4.21875, + "loss/jsd": 0.0, + "loss/logits": 0.390767902135849, + "step": 460 + }, + { + "epoch": 0.028875, + "grad_norm": 4.96875, + "grad_norm_var": 0.173046875, + "learning_rate": 0.0001, + "loss": 10.7852, + "loss/crossentropy": 2.5109020471572876, + "loss/hidden": 4.140625, + "loss/jsd": 0.0, + "loss/logits": 0.35967275500297546, + "step": 462 + }, + { + "epoch": 0.029, + "grad_norm": 5.0, + "grad_norm_var": 0.15071207682291668, + "learning_rate": 0.0001, + "loss": 10.6219, + "loss/crossentropy": 2.407975435256958, + "loss/hidden": 4.03125, + "loss/jsd": 0.0, + "loss/logits": 0.3790464997291565, + "step": 464 + }, + { + "epoch": 0.029125, + "grad_norm": 5.53125, + "grad_norm_var": 0.13730061848958333, + "learning_rate": 0.0001, + "loss": 10.3357, + "loss/crossentropy": 2.564648985862732, + "loss/hidden": 4.09375, + "loss/jsd": 0.0, + "loss/logits": 0.3769105225801468, + "step": 466 + }, + { + "epoch": 0.02925, + "grad_norm": 4.96875, + "grad_norm_var": 0.1544921875, + "learning_rate": 0.0001, + "loss": 10.566, + "loss/crossentropy": 2.4403003454208374, + "loss/hidden": 4.109375, + "loss/jsd": 0.0, + "loss/logits": 0.3570362627506256, + "step": 468 + }, + { + "epoch": 0.029375, + "grad_norm": 4.53125, + "grad_norm_var": 0.18782145182291668, + "learning_rate": 0.0001, + "loss": 10.4887, + "loss/crossentropy": 2.3666934967041016, + "loss/hidden": 4.171875, + "loss/jsd": 0.0, + "loss/logits": 0.40434208512306213, + "step": 470 + }, + { + "epoch": 0.0295, + "grad_norm": 5.46875, + "grad_norm_var": 0.189697265625, + "learning_rate": 0.0001, + "loss": 10.7792, + "loss/crossentropy": 2.679360866546631, + "loss/hidden": 4.109375, + "loss/jsd": 0.0, + "loss/logits": 0.383465513586998, + "step": 472 + }, + { + "epoch": 0.029625, + "grad_norm": 6.09375, + "grad_norm_var": 1.400390625, + "learning_rate": 0.0001, + "loss": 10.7712, + "loss/crossentropy": 2.700055718421936, + "loss/hidden": 4.171875, + "loss/jsd": 0.0, + "loss/logits": 0.412184476852417, + "step": 474 + }, + { + "epoch": 0.02975, + "grad_norm": 5.21875, + "grad_norm_var": 1.6235026041666667, + "learning_rate": 0.0001, + "loss": 10.6595, + "loss/crossentropy": 2.456274390220642, + "loss/hidden": 4.15625, + "loss/jsd": 0.0, + "loss/logits": 0.40182630717754364, + "step": 476 + }, + { + "epoch": 0.029875, + "grad_norm": 4.78125, + "grad_norm_var": 1.6465983072916666, + "learning_rate": 0.0001, + "loss": 10.7663, + "loss/crossentropy": 2.4097973108291626, + "loss/hidden": 4.109375, + "loss/jsd": 0.0, + "loss/logits": 0.3738597333431244, + "step": 478 + }, + { + "epoch": 0.03, + "grad_norm": 5.8125, + "grad_norm_var": 1.5962076822916667, + "learning_rate": 0.0001, + "loss": 10.7901, + "loss/crossentropy": 2.4475165605545044, + "loss/hidden": 4.171875, + "loss/jsd": 0.0, + "loss/logits": 0.40748435258865356, + "step": 480 + }, + { + "epoch": 0.030125, + "grad_norm": 4.75, + "grad_norm_var": 1.6552042643229166, + "learning_rate": 0.0001, + "loss": 10.1387, + "loss/crossentropy": 2.28298556804657, + "loss/hidden": 3.96875, + "loss/jsd": 0.0, + "loss/logits": 0.36281222105026245, + "step": 482 + }, + { + "epoch": 0.03025, + "grad_norm": 5.9375, + "grad_norm_var": 1.6994425455729167, + "learning_rate": 0.0001, + "loss": 10.316, + "loss/crossentropy": 2.3355804681777954, + "loss/hidden": 4.171875, + "loss/jsd": 0.0, + "loss/logits": 0.34566931426525116, + "step": 484 + }, + { + "epoch": 0.030375, + "grad_norm": 4.8125, + "grad_norm_var": 1.6235026041666667, + "learning_rate": 0.0001, + "loss": 10.6935, + "loss/crossentropy": 2.6970983743667603, + "loss/hidden": 4.09375, + "loss/jsd": 0.0, + "loss/logits": 0.4392566382884979, + "step": 486 + }, + { + "epoch": 0.0305, + "grad_norm": 7.625, + "grad_norm_var": 1.9055826822916666, + "learning_rate": 0.0001, + "loss": 10.5326, + "loss/crossentropy": 2.4185194969177246, + "loss/hidden": 4.171875, + "loss/jsd": 0.0, + "loss/logits": 0.3976728916168213, + "step": 488 + }, + { + "epoch": 0.030625, + "grad_norm": 5.09375, + "grad_norm_var": 0.86724853515625, + "learning_rate": 0.0001, + "loss": 10.5936, + "loss/crossentropy": 2.390311121940613, + "loss/hidden": 4.0859375, + "loss/jsd": 0.0, + "loss/logits": 0.379798486828804, + "step": 490 + }, + { + "epoch": 0.03075, + "grad_norm": 4.71875, + "grad_norm_var": 0.5812337239583333, + "learning_rate": 0.0001, + "loss": 10.4348, + "loss/crossentropy": 2.5370208024978638, + "loss/hidden": 4.109375, + "loss/jsd": 0.0, + "loss/logits": 0.3812776803970337, + "step": 492 + }, + { + "epoch": 0.030875, + "grad_norm": 4.53125, + "grad_norm_var": 0.59869384765625, + "learning_rate": 0.0001, + "loss": 10.2674, + "loss/crossentropy": 2.400723934173584, + "loss/hidden": 4.078125, + "loss/jsd": 0.0, + "loss/logits": 0.3432563245296478, + "step": 494 + }, + { + "epoch": 0.031, + "grad_norm": 4.71875, + "grad_norm_var": 0.575634765625, + "learning_rate": 0.0001, + "loss": 10.3094, + "loss/crossentropy": 2.44759202003479, + "loss/hidden": 3.9765625, + "loss/jsd": 0.0, + "loss/logits": 0.3646901249885559, + "step": 496 + }, + { + "epoch": 0.031125, + "grad_norm": 5.71875, + "grad_norm_var": 0.5779296875, + "learning_rate": 0.0001, + "loss": 10.6323, + "loss/crossentropy": 2.4714183807373047, + "loss/hidden": 4.140625, + "loss/jsd": 0.0, + "loss/logits": 0.4395069479942322, + "step": 498 + }, + { + "epoch": 0.03125, + "grad_norm": 4.875, + "grad_norm_var": 0.5239420572916667, + "learning_rate": 0.0001, + "loss": 10.5234, + "loss/crossentropy": 2.425844192504883, + "loss/hidden": 4.078125, + "loss/jsd": 0.0, + "loss/logits": 0.3750711977481842, + "step": 500 + }, + { + "epoch": 0.031375, + "grad_norm": 4.8125, + "grad_norm_var": 0.5263631184895833, + "learning_rate": 0.0001, + "loss": 10.0027, + "loss/crossentropy": 2.187627673149109, + "loss/hidden": 3.9375, + "loss/jsd": 0.0, + "loss/logits": 0.3448432832956314, + "step": 502 + }, + { + "epoch": 0.0315, + "grad_norm": 4.75, + "grad_norm_var": 0.08319905598958334, + "learning_rate": 0.0001, + "loss": 10.3973, + "loss/crossentropy": 2.592836856842041, + "loss/hidden": 4.078125, + "loss/jsd": 0.0, + "loss/logits": 0.3808598816394806, + "step": 504 + }, + { + "epoch": 0.031625, + "grad_norm": 4.78125, + "grad_norm_var": 0.086181640625, + "learning_rate": 0.0001, + "loss": 10.3901, + "loss/crossentropy": 2.621356964111328, + "loss/hidden": 4.140625, + "loss/jsd": 0.0, + "loss/logits": 0.37457969784736633, + "step": 506 + }, + { + "epoch": 0.03175, + "grad_norm": 13.375, + "grad_norm_var": 4.523758951822916, + "learning_rate": 0.0001, + "loss": 10.7639, + "loss/crossentropy": 2.4040629863739014, + "loss/hidden": 4.0078125, + "loss/jsd": 0.0, + "loss/logits": 0.40050315856933594, + "step": 508 + }, + { + "epoch": 0.031875, + "grad_norm": 6.40625, + "grad_norm_var": 4.598368326822917, + "learning_rate": 0.0001, + "loss": 10.5925, + "loss/crossentropy": 2.6625452041625977, + "loss/hidden": 4.109375, + "loss/jsd": 0.0, + "loss/logits": 0.42064009606838226, + "step": 510 + }, + { + "epoch": 0.032, + "grad_norm": 4.6875, + "grad_norm_var": 4.583463541666666, + "learning_rate": 0.0001, + "loss": 10.6113, + "loss/crossentropy": 2.3583052158355713, + "loss/hidden": 4.09375, + "loss/jsd": 0.0, + "loss/logits": 0.40322598814964294, + "step": 512 + }, + { + "epoch": 0.032125, + "grad_norm": 5.25, + "grad_norm_var": 4.567867024739583, + "learning_rate": 0.0001, + "loss": 10.4471, + "loss/crossentropy": 2.333785891532898, + "loss/hidden": 4.0078125, + "loss/jsd": 0.0, + "loss/logits": 0.37443122267723083, + "step": 514 + }, + { + "epoch": 0.03225, + "grad_norm": 4.65625, + "grad_norm_var": 4.6419921875, + "learning_rate": 0.0001, + "loss": 10.1493, + "loss/crossentropy": 2.500606060028076, + "loss/hidden": 4.015625, + "loss/jsd": 0.0, + "loss/logits": 0.3583361357450485, + "step": 516 + }, + { + "epoch": 0.032375, + "grad_norm": 4.78125, + "grad_norm_var": 4.674247233072917, + "learning_rate": 0.0001, + "loss": 10.1221, + "loss/crossentropy": 2.201894164085388, + "loss/hidden": 4.046875, + "loss/jsd": 0.0, + "loss/logits": 0.35936446487903595, + "step": 518 + }, + { + "epoch": 0.0325, + "grad_norm": 4.625, + "grad_norm_var": 4.658784993489584, + "learning_rate": 0.0001, + "loss": 10.7879, + "loss/crossentropy": 2.422861099243164, + "loss/hidden": 4.09375, + "loss/jsd": 0.0, + "loss/logits": 0.4262392073869705, + "step": 520 + }, + { + "epoch": 0.032625, + "grad_norm": 6.15625, + "grad_norm_var": 4.619755045572917, + "learning_rate": 0.0001, + "loss": 10.5335, + "loss/crossentropy": 2.609155774116516, + "loss/hidden": 4.09375, + "loss/jsd": 0.0, + "loss/logits": 0.40598247945308685, + "step": 522 + }, + { + "epoch": 0.03275, + "grad_norm": 4.9375, + "grad_norm_var": 0.5051920572916667, + "learning_rate": 0.0001, + "loss": 10.2853, + "loss/crossentropy": 2.4510494470596313, + "loss/hidden": 4.171875, + "loss/jsd": 0.0, + "loss/logits": 0.4115123152732849, + "step": 524 + }, + { + "epoch": 0.032875, + "grad_norm": 4.65625, + "grad_norm_var": 0.26568603515625, + "learning_rate": 0.0001, + "loss": 10.7524, + "loss/crossentropy": 2.604946732521057, + "loss/hidden": 4.015625, + "loss/jsd": 0.0, + "loss/logits": 0.41282832622528076, + "step": 526 + }, + { + "epoch": 0.033, + "grad_norm": 10.4375, + "grad_norm_var": 2.0192545572916667, + "learning_rate": 0.0001, + "loss": 10.4002, + "loss/crossentropy": 2.611866593360901, + "loss/hidden": 4.015625, + "loss/jsd": 0.0, + "loss/logits": 0.37592028081417084, + "step": 528 + }, + { + "epoch": 0.033125, + "grad_norm": 5.0, + "grad_norm_var": 2.0321451822916665, + "learning_rate": 0.0001, + "loss": 10.4279, + "loss/crossentropy": 2.599483370780945, + "loss/hidden": 4.140625, + "loss/jsd": 0.0, + "loss/logits": 0.37497493624687195, + "step": 530 + }, + { + "epoch": 0.03325, + "grad_norm": 4.6875, + "grad_norm_var": 2.026497395833333, + "learning_rate": 0.0001, + "loss": 10.1858, + "loss/crossentropy": 2.2957273721694946, + "loss/hidden": 4.015625, + "loss/jsd": 0.0, + "loss/logits": 0.36572718620300293, + "step": 532 + }, + { + "epoch": 0.033375, + "grad_norm": 4.8125, + "grad_norm_var": 1.99713134765625, + "learning_rate": 0.0001, + "loss": 10.4544, + "loss/crossentropy": 2.44324791431427, + "loss/hidden": 4.171875, + "loss/jsd": 0.0, + "loss/logits": 0.4093717336654663, + "step": 534 + }, + { + "epoch": 0.0335, + "grad_norm": 4.84375, + "grad_norm_var": 1.9977701822916667, + "learning_rate": 0.0001, + "loss": 10.2236, + "loss/crossentropy": 2.415123224258423, + "loss/hidden": 4.078125, + "loss/jsd": 0.0, + "loss/logits": 0.36705660820007324, + "step": 536 + }, + { + "epoch": 0.033625, + "grad_norm": 4.875, + "grad_norm_var": 1.9493326822916666, + "learning_rate": 0.0001, + "loss": 10.5376, + "loss/crossentropy": 2.466187596321106, + "loss/hidden": 4.09375, + "loss/jsd": 0.0, + "loss/logits": 0.40770016610622406, + "step": 538 + }, + { + "epoch": 0.03375, + "grad_norm": 6.03125, + "grad_norm_var": 1.965478515625, + "learning_rate": 0.0001, + "loss": 10.4253, + "loss/crossentropy": 2.379727602005005, + "loss/hidden": 4.015625, + "loss/jsd": 0.0, + "loss/logits": 0.3828308582305908, + "step": 540 + }, + { + "epoch": 0.033875, + "grad_norm": 5.5625, + "grad_norm_var": 1.9148274739583333, + "learning_rate": 0.0001, + "loss": 10.4729, + "loss/crossentropy": 2.502163052558899, + "loss/hidden": 4.0625, + "loss/jsd": 0.0, + "loss/logits": 0.43099866807460785, + "step": 542 + }, + { + "epoch": 0.034, + "grad_norm": 4.9375, + "grad_norm_var": 0.15963541666666667, + "learning_rate": 0.0001, + "loss": 10.353, + "loss/crossentropy": 2.501845955848694, + "loss/hidden": 3.9296875, + "loss/jsd": 0.0, + "loss/logits": 0.3675364851951599, + "step": 544 + }, + { + "epoch": 0.034125, + "grad_norm": 5.40625, + "grad_norm_var": 0.18144124348958332, + "learning_rate": 0.0001, + "loss": 10.2907, + "loss/crossentropy": 2.376683473587036, + "loss/hidden": 4.109375, + "loss/jsd": 0.0, + "loss/logits": 0.41435733437538147, + "step": 546 + }, + { + "epoch": 0.03425, + "grad_norm": 5.34375, + "grad_norm_var": 0.17939046223958333, + "learning_rate": 0.0001, + "loss": 10.2278, + "loss/crossentropy": 2.230885148048401, + "loss/hidden": 4.109375, + "loss/jsd": 0.0, + "loss/logits": 0.3870948702096939, + "step": 548 + }, + { + "epoch": 0.034375, + "grad_norm": 4.96875, + "grad_norm_var": 0.18331705729166667, + "learning_rate": 0.0001, + "loss": 10.0963, + "loss/crossentropy": 2.3854016065597534, + "loss/hidden": 3.9453125, + "loss/jsd": 0.0, + "loss/logits": 0.3725956082344055, + "step": 550 + }, + { + "epoch": 0.0345, + "grad_norm": 4.8125, + "grad_norm_var": 0.18017171223958334, + "learning_rate": 0.0001, + "loss": 10.4064, + "loss/crossentropy": 2.30772066116333, + "loss/hidden": 4.0625, + "loss/jsd": 0.0, + "loss/logits": 0.41719433665275574, + "step": 552 + }, + { + "epoch": 0.034625, + "grad_norm": 4.9375, + "grad_norm_var": 0.19784749348958333, + "learning_rate": 0.0001, + "loss": 9.9904, + "loss/crossentropy": 2.4518308639526367, + "loss/hidden": 3.9140625, + "loss/jsd": 0.0, + "loss/logits": 0.367490217089653, + "step": 554 + }, + { + "epoch": 0.03475, + "grad_norm": 4.65625, + "grad_norm_var": 0.145166015625, + "learning_rate": 0.0001, + "loss": 10.186, + "loss/crossentropy": 2.5720479488372803, + "loss/hidden": 3.96875, + "loss/jsd": 0.0, + "loss/logits": 0.36091138422489166, + "step": 556 + }, + { + "epoch": 0.034875, + "grad_norm": 4.65625, + "grad_norm_var": 0.09309488932291667, + "learning_rate": 0.0001, + "loss": 10.0815, + "loss/crossentropy": 2.4808801412582397, + "loss/hidden": 4.03125, + "loss/jsd": 0.0, + "loss/logits": 0.3723009526729584, + "step": 558 + }, + { + "epoch": 0.035, + "grad_norm": 4.71875, + "grad_norm_var": 0.09099934895833334, + "learning_rate": 0.0001, + "loss": 10.0476, + "loss/crossentropy": 2.4010642766952515, + "loss/hidden": 3.9453125, + "loss/jsd": 0.0, + "loss/logits": 0.3671903610229492, + "step": 560 + }, + { + "epoch": 0.035125, + "grad_norm": 5.375, + "grad_norm_var": 0.08674723307291667, + "learning_rate": 0.0001, + "loss": 10.4371, + "loss/crossentropy": 2.4921680688858032, + "loss/hidden": 4.1015625, + "loss/jsd": 0.0, + "loss/logits": 0.4616352915763855, + "step": 562 + }, + { + "epoch": 0.03525, + "grad_norm": 4.6875, + "grad_norm_var": 0.049540201822916664, + "learning_rate": 0.0001, + "loss": 10.2687, + "loss/crossentropy": 2.5980935096740723, + "loss/hidden": 4.03125, + "loss/jsd": 0.0, + "loss/logits": 0.4135672152042389, + "step": 564 + }, + { + "epoch": 0.035375, + "grad_norm": 5.0625, + "grad_norm_var": 0.053446451822916664, + "learning_rate": 0.0001, + "loss": 10.2881, + "loss/crossentropy": 2.515305280685425, + "loss/hidden": 3.875, + "loss/jsd": 0.0, + "loss/logits": 0.3722418546676636, + "step": 566 + }, + { + "epoch": 0.0355, + "grad_norm": 4.65625, + "grad_norm_var": 0.06464436848958334, + "learning_rate": 0.0001, + "loss": 10.2975, + "loss/crossentropy": 2.559085965156555, + "loss/hidden": 4.125, + "loss/jsd": 0.0, + "loss/logits": 0.42906875908374786, + "step": 568 + }, + { + "epoch": 0.035625, + "grad_norm": 4.3125, + "grad_norm_var": 0.08541666666666667, + "learning_rate": 0.0001, + "loss": 9.8115, + "loss/crossentropy": 2.2619433403015137, + "loss/hidden": 3.984375, + "loss/jsd": 0.0, + "loss/logits": 0.3420299142599106, + "step": 570 + }, + { + "epoch": 0.03575, + "grad_norm": 4.5, + "grad_norm_var": 0.14739583333333334, + "learning_rate": 0.0001, + "loss": 10.4233, + "loss/crossentropy": 2.5262789726257324, + "loss/hidden": 3.953125, + "loss/jsd": 0.0, + "loss/logits": 0.39023733139038086, + "step": 572 + }, + { + "epoch": 0.035875, + "grad_norm": 4.90625, + "grad_norm_var": 0.30859375, + "learning_rate": 0.0001, + "loss": 10.3246, + "loss/crossentropy": 2.567444682121277, + "loss/hidden": 3.984375, + "loss/jsd": 0.0, + "loss/logits": 0.3690713047981262, + "step": 574 + }, + { + "epoch": 0.036, + "grad_norm": 4.96875, + "grad_norm_var": 0.30383707682291666, + "learning_rate": 0.0001, + "loss": 10.355, + "loss/crossentropy": 2.6742849349975586, + "loss/hidden": 4.109375, + "loss/jsd": 0.0, + "loss/logits": 0.37552310526371, + "step": 576 + }, + { + "epoch": 0.036125, + "grad_norm": 4.40625, + "grad_norm_var": 0.3316243489583333, + "learning_rate": 0.0001, + "loss": 10.1256, + "loss/crossentropy": 2.3429067134857178, + "loss/hidden": 4.015625, + "loss/jsd": 0.0, + "loss/logits": 0.37690627574920654, + "step": 578 + }, + { + "epoch": 0.03625, + "grad_norm": 5.3125, + "grad_norm_var": 0.47980143229166666, + "learning_rate": 0.0001, + "loss": 10.6361, + "loss/crossentropy": 2.4363961219787598, + "loss/hidden": 4.1875, + "loss/jsd": 0.0, + "loss/logits": 0.4327695965766907, + "step": 580 + }, + { + "epoch": 0.036375, + "grad_norm": 4.46875, + "grad_norm_var": 0.49347330729166666, + "learning_rate": 0.0001, + "loss": 10.2303, + "loss/crossentropy": 2.6203149557113647, + "loss/hidden": 4.0625, + "loss/jsd": 0.0, + "loss/logits": 0.3839530050754547, + "step": 582 + }, + { + "epoch": 0.0365, + "grad_norm": 4.6875, + "grad_norm_var": 0.503125, + "learning_rate": 0.0001, + "loss": 10.4335, + "loss/crossentropy": 2.705429792404175, + "loss/hidden": 3.953125, + "loss/jsd": 0.0, + "loss/logits": 0.3913826197385788, + "step": 584 + }, + { + "epoch": 0.036625, + "grad_norm": 5.59375, + "grad_norm_var": 0.46151936848958336, + "learning_rate": 0.0001, + "loss": 10.1133, + "loss/crossentropy": 2.2338947057724, + "loss/hidden": 3.921875, + "loss/jsd": 0.0, + "loss/logits": 0.3503521531820297, + "step": 586 + }, + { + "epoch": 0.03675, + "grad_norm": 4.3125, + "grad_norm_var": 0.4554036458333333, + "learning_rate": 0.0001, + "loss": 10.2664, + "loss/crossentropy": 2.4711296558380127, + "loss/hidden": 4.03125, + "loss/jsd": 0.0, + "loss/logits": 0.39591944217681885, + "step": 588 + }, + { + "epoch": 0.036875, + "grad_norm": 4.40625, + "grad_norm_var": 0.38765869140625, + "learning_rate": 0.0001, + "loss": 10.2556, + "loss/crossentropy": 2.4357420206069946, + "loss/hidden": 3.8515625, + "loss/jsd": 0.0, + "loss/logits": 0.38228775560855865, + "step": 590 + }, + { + "epoch": 0.037, + "grad_norm": 5.0625, + "grad_norm_var": 0.3698201497395833, + "learning_rate": 0.0001, + "loss": 10.1555, + "loss/crossentropy": 2.263747811317444, + "loss/hidden": 4.0, + "loss/jsd": 0.0, + "loss/logits": 0.40818026661872864, + "step": 592 + }, + { + "epoch": 0.037125, + "grad_norm": 4.5, + "grad_norm_var": 0.3490193684895833, + "learning_rate": 0.0001, + "loss": 9.9666, + "loss/crossentropy": 2.54870069026947, + "loss/hidden": 3.9453125, + "loss/jsd": 0.0, + "loss/logits": 0.37873171269893646, + "step": 594 + }, + { + "epoch": 0.03725, + "grad_norm": 5.96875, + "grad_norm_var": 0.25271809895833336, + "learning_rate": 0.0001, + "loss": 10.1816, + "loss/crossentropy": 2.5280078649520874, + "loss/hidden": 3.9375, + "loss/jsd": 0.0, + "loss/logits": 0.3799082934856415, + "step": 596 + }, + { + "epoch": 0.037375, + "grad_norm": 4.53125, + "grad_norm_var": 0.266259765625, + "learning_rate": 0.0001, + "loss": 9.871, + "loss/crossentropy": 2.1612548232078552, + "loss/hidden": 3.921875, + "loss/jsd": 0.0, + "loss/logits": 0.3508923500776291, + "step": 598 + }, + { + "epoch": 0.0375, + "grad_norm": 5.4375, + "grad_norm_var": 0.28664957682291664, + "learning_rate": 0.0001, + "loss": 10.0122, + "loss/crossentropy": 2.33645498752594, + "loss/hidden": 4.046875, + "loss/jsd": 0.0, + "loss/logits": 0.34663376212120056, + "step": 600 + }, + { + "epoch": 0.037625, + "grad_norm": 4.15625, + "grad_norm_var": 0.25558268229166664, + "learning_rate": 0.0001, + "loss": 9.8853, + "loss/crossentropy": 2.263962745666504, + "loss/hidden": 3.890625, + "loss/jsd": 0.0, + "loss/logits": 0.3584403544664383, + "step": 602 + }, + { + "epoch": 0.03775, + "grad_norm": 4.9375, + "grad_norm_var": 0.243603515625, + "learning_rate": 0.0001, + "loss": 9.9895, + "loss/crossentropy": 2.4892383813858032, + "loss/hidden": 3.9375, + "loss/jsd": 0.0, + "loss/logits": 0.3693755269050598, + "step": 604 + }, + { + "epoch": 0.037875, + "grad_norm": 4.5, + "grad_norm_var": 0.218603515625, + "learning_rate": 0.0001, + "loss": 9.9086, + "loss/crossentropy": 2.5046552419662476, + "loss/hidden": 3.984375, + "loss/jsd": 0.0, + "loss/logits": 0.3778345286846161, + "step": 606 + }, + { + "epoch": 0.038, + "grad_norm": 4.46875, + "grad_norm_var": 0.21441650390625, + "learning_rate": 0.0001, + "loss": 9.9579, + "loss/crossentropy": 2.383268356323242, + "loss/hidden": 3.90625, + "loss/jsd": 0.0, + "loss/logits": 0.35156671702861786, + "step": 608 + }, + { + "epoch": 0.038125, + "grad_norm": 4.59375, + "grad_norm_var": 0.21565348307291668, + "learning_rate": 0.0001, + "loss": 10.1245, + "loss/crossentropy": 2.6101828813552856, + "loss/hidden": 3.96875, + "loss/jsd": 0.0, + "loss/logits": 0.36821986734867096, + "step": 610 + }, + { + "epoch": 0.03825, + "grad_norm": 4.5625, + "grad_norm_var": 0.10784098307291666, + "learning_rate": 0.0001, + "loss": 9.6363, + "loss/crossentropy": 2.154883623123169, + "loss/hidden": 3.84375, + "loss/jsd": 0.0, + "loss/logits": 0.351752445101738, + "step": 612 + }, + { + "epoch": 0.038375, + "grad_norm": 4.65625, + "grad_norm_var": 0.10461832682291666, + "learning_rate": 0.0001, + "loss": 10.1564, + "loss/crossentropy": 2.7461551427841187, + "loss/hidden": 4.078125, + "loss/jsd": 0.0, + "loss/logits": 0.40302354097366333, + "step": 614 + }, + { + "epoch": 0.0385, + "grad_norm": 4.65625, + "grad_norm_var": 0.04967447916666667, + "learning_rate": 0.0001, + "loss": 9.964, + "loss/crossentropy": 2.215000867843628, + "loss/hidden": 4.046875, + "loss/jsd": 0.0, + "loss/logits": 0.37297672033309937, + "step": 616 + }, + { + "epoch": 0.038625, + "grad_norm": 5.03125, + "grad_norm_var": 0.0564453125, + "learning_rate": 0.0001, + "loss": 9.8702, + "loss/crossentropy": 2.5331451892852783, + "loss/hidden": 3.90625, + "loss/jsd": 0.0, + "loss/logits": 0.3476633280515671, + "step": 618 + }, + { + "epoch": 0.03875, + "grad_norm": 4.6875, + "grad_norm_var": 0.07550455729166666, + "learning_rate": 0.0001, + "loss": 10.4216, + "loss/crossentropy": 2.4520362615585327, + "loss/hidden": 3.96875, + "loss/jsd": 0.0, + "loss/logits": 0.38907913863658905, + "step": 620 + }, + { + "epoch": 0.038875, + "grad_norm": 4.1875, + "grad_norm_var": 0.0876953125, + "learning_rate": 0.0001, + "loss": 9.9443, + "loss/crossentropy": 2.1309529542922974, + "loss/hidden": 3.8984375, + "loss/jsd": 0.0, + "loss/logits": 0.3472695052623749, + "step": 622 + }, + { + "epoch": 0.039, + "grad_norm": 5.0625, + "grad_norm_var": 0.10517171223958334, + "learning_rate": 0.0001, + "loss": 10.2231, + "loss/crossentropy": 2.5359339714050293, + "loss/hidden": 3.9765625, + "loss/jsd": 0.0, + "loss/logits": 0.3966711014509201, + "step": 624 + }, + { + "epoch": 0.039125, + "grad_norm": 5.34375, + "grad_norm_var": 0.13763020833333334, + "learning_rate": 0.0001, + "loss": 10.0318, + "loss/crossentropy": 2.4495298862457275, + "loss/hidden": 3.9765625, + "loss/jsd": 0.0, + "loss/logits": 0.3523852229118347, + "step": 626 + }, + { + "epoch": 0.03925, + "grad_norm": 4.6875, + "grad_norm_var": 0.12646077473958334, + "learning_rate": 0.0001, + "loss": 10.1339, + "loss/crossentropy": 2.4048619270324707, + "loss/hidden": 3.953125, + "loss/jsd": 0.0, + "loss/logits": 0.37286487221717834, + "step": 628 + }, + { + "epoch": 0.039375, + "grad_norm": 4.0625, + "grad_norm_var": 0.15050455729166667, + "learning_rate": 0.0001, + "loss": 9.5416, + "loss/crossentropy": 2.2729824781417847, + "loss/hidden": 3.8515625, + "loss/jsd": 0.0, + "loss/logits": 0.35299360752105713, + "step": 630 + }, + { + "epoch": 0.0395, + "grad_norm": 4.625, + "grad_norm_var": 0.14390869140625, + "learning_rate": 0.0001, + "loss": 9.6436, + "loss/crossentropy": 2.464027762413025, + "loss/hidden": 3.8515625, + "loss/jsd": 0.0, + "loss/logits": 0.3433645963668823, + "step": 632 + }, + { + "epoch": 0.039625, + "grad_norm": 4.625, + "grad_norm_var": 0.14296468098958334, + "learning_rate": 0.0001, + "loss": 10.0132, + "loss/crossentropy": 2.260706663131714, + "loss/hidden": 3.9375, + "loss/jsd": 0.0, + "loss/logits": 0.358672633767128, + "step": 634 + }, + { + "epoch": 0.03975, + "grad_norm": 5.34375, + "grad_norm_var": 0.16431884765625, + "learning_rate": 0.0001, + "loss": 10.0665, + "loss/crossentropy": 2.443928599357605, + "loss/hidden": 3.9140625, + "loss/jsd": 0.0, + "loss/logits": 0.34917180240154266, + "step": 636 + }, + { + "epoch": 0.039875, + "grad_norm": 4.15625, + "grad_norm_var": 0.16360270182291667, + "learning_rate": 0.0001, + "loss": 9.8865, + "loss/crossentropy": 2.355438470840454, + "loss/hidden": 3.921875, + "loss/jsd": 0.0, + "loss/logits": 0.3681969791650772, + "step": 638 + }, + { + "epoch": 0.04, + "grad_norm": 5.15625, + "grad_norm_var": 0.17375895182291667, + "learning_rate": 0.0001, + "loss": 9.988, + "loss/crossentropy": 2.403064250946045, + "loss/hidden": 3.890625, + "loss/jsd": 0.0, + "loss/logits": 0.3809404671192169, + "step": 640 + }, + { + "epoch": 0.040125, + "grad_norm": 5.25, + "grad_norm_var": 0.16539306640625, + "learning_rate": 0.0001, + "loss": 9.8894, + "loss/crossentropy": 2.277324080467224, + "loss/hidden": 3.7890625, + "loss/jsd": 0.0, + "loss/logits": 0.33590181171894073, + "step": 642 + }, + { + "epoch": 0.04025, + "grad_norm": 4.75, + "grad_norm_var": 0.16705729166666666, + "learning_rate": 0.0001, + "loss": 10.0104, + "loss/crossentropy": 2.6433370113372803, + "loss/hidden": 3.9765625, + "loss/jsd": 0.0, + "loss/logits": 0.3841419368982315, + "step": 644 + }, + { + "epoch": 0.040375, + "grad_norm": 4.375, + "grad_norm_var": 0.148828125, + "learning_rate": 0.0001, + "loss": 9.9156, + "loss/crossentropy": 2.5485310554504395, + "loss/hidden": 3.796875, + "loss/jsd": 0.0, + "loss/logits": 0.34872615337371826, + "step": 646 + }, + { + "epoch": 0.0405, + "grad_norm": 4.0, + "grad_norm_var": 0.17229410807291667, + "learning_rate": 0.0001, + "loss": 9.6353, + "loss/crossentropy": 2.5063722133636475, + "loss/hidden": 3.859375, + "loss/jsd": 0.0, + "loss/logits": 0.36940453946590424, + "step": 648 + }, + { + "epoch": 0.040625, + "grad_norm": 4.65625, + "grad_norm_var": 0.18136393229166667, + "learning_rate": 0.0001, + "loss": 9.9078, + "loss/crossentropy": 2.39488685131073, + "loss/hidden": 3.875, + "loss/jsd": 0.0, + "loss/logits": 0.3586680442094803, + "step": 650 + }, + { + "epoch": 0.04075, + "grad_norm": 4.59375, + "grad_norm_var": 0.13019205729166666, + "learning_rate": 0.0001, + "loss": 9.9067, + "loss/crossentropy": 2.377121686935425, + "loss/hidden": 3.8359375, + "loss/jsd": 0.0, + "loss/logits": 0.36309733986854553, + "step": 652 + }, + { + "epoch": 0.040875, + "grad_norm": 4.28125, + "grad_norm_var": 0.12916666666666668, + "learning_rate": 0.0001, + "loss": 10.099, + "loss/crossentropy": 2.5704420804977417, + "loss/hidden": 3.8984375, + "loss/jsd": 0.0, + "loss/logits": 0.3873720318078995, + "step": 654 + }, + { + "epoch": 0.041, + "grad_norm": 5.90625, + "grad_norm_var": 0.21584879557291667, + "learning_rate": 0.0001, + "loss": 9.9181, + "loss/crossentropy": 2.2711371183395386, + "loss/hidden": 4.0078125, + "loss/jsd": 0.0, + "loss/logits": 0.3648662865161896, + "step": 656 + }, + { + "epoch": 0.041125, + "grad_norm": 5.25, + "grad_norm_var": 0.21679280598958334, + "learning_rate": 0.0001, + "loss": 10.0834, + "loss/crossentropy": 2.5198888778686523, + "loss/hidden": 3.9375, + "loss/jsd": 0.0, + "loss/logits": 0.3765381723642349, + "step": 658 + }, + { + "epoch": 0.04125, + "grad_norm": 4.40625, + "grad_norm_var": 0.21443684895833334, + "learning_rate": 0.0001, + "loss": 9.7288, + "loss/crossentropy": 2.2896225452423096, + "loss/hidden": 3.7890625, + "loss/jsd": 0.0, + "loss/logits": 0.35969309508800507, + "step": 660 + }, + { + "epoch": 0.041375, + "grad_norm": 4.5625, + "grad_norm_var": 0.21106770833333333, + "learning_rate": 0.0001, + "loss": 10.0562, + "loss/crossentropy": 2.4178755283355713, + "loss/hidden": 3.8828125, + "loss/jsd": 0.0, + "loss/logits": 0.3961552679538727, + "step": 662 + }, + { + "epoch": 0.0415, + "grad_norm": 4.71875, + "grad_norm_var": 0.19529622395833332, + "learning_rate": 0.0001, + "loss": 9.9744, + "loss/crossentropy": 2.547677516937256, + "loss/hidden": 3.828125, + "loss/jsd": 0.0, + "loss/logits": 0.3570811301469803, + "step": 664 + }, + { + "epoch": 0.041625, + "grad_norm": 4.78125, + "grad_norm_var": 0.17502848307291666, + "learning_rate": 0.0001, + "loss": 9.8543, + "loss/crossentropy": 2.4284908771514893, + "loss/hidden": 3.84375, + "loss/jsd": 0.0, + "loss/logits": 0.3434506356716156, + "step": 666 + }, + { + "epoch": 0.04175, + "grad_norm": 4.1875, + "grad_norm_var": 0.19075520833333334, + "learning_rate": 0.0001, + "loss": 9.9655, + "loss/crossentropy": 2.108651876449585, + "loss/hidden": 3.8359375, + "loss/jsd": 0.0, + "loss/logits": 0.36953288316726685, + "step": 668 + }, + { + "epoch": 0.041875, + "grad_norm": 4.71875, + "grad_norm_var": 0.17646077473958333, + "learning_rate": 0.0001, + "loss": 10.0564, + "loss/crossentropy": 2.3314043283462524, + "loss/hidden": 3.8359375, + "loss/jsd": 0.0, + "loss/logits": 0.3440769463777542, + "step": 670 + }, + { + "epoch": 0.042, + "grad_norm": 4.65625, + "grad_norm_var": 0.06848551432291666, + "learning_rate": 0.0001, + "loss": 9.8599, + "loss/crossentropy": 2.3711284399032593, + "loss/hidden": 3.84375, + "loss/jsd": 0.0, + "loss/logits": 0.34241366386413574, + "step": 672 + }, + { + "epoch": 0.042125, + "grad_norm": 5.0, + "grad_norm_var": 0.05006103515625, + "learning_rate": 0.0001, + "loss": 9.9082, + "loss/crossentropy": 2.3005160093307495, + "loss/hidden": 3.8125, + "loss/jsd": 0.0, + "loss/logits": 0.3360006958246231, + "step": 674 + }, + { + "epoch": 0.04225, + "grad_norm": 4.09375, + "grad_norm_var": 0.068359375, + "learning_rate": 0.0001, + "loss": 10.0805, + "loss/crossentropy": 2.590296149253845, + "loss/hidden": 3.890625, + "loss/jsd": 0.0, + "loss/logits": 0.3790188133716583, + "step": 676 + }, + { + "epoch": 0.042375, + "grad_norm": 4.3125, + "grad_norm_var": 0.080322265625, + "learning_rate": 0.0001, + "loss": 9.8713, + "loss/crossentropy": 2.582974672317505, + "loss/hidden": 3.8359375, + "loss/jsd": 0.0, + "loss/logits": 0.3644176125526428, + "step": 678 + }, + { + "epoch": 0.0425, + "grad_norm": 4.0625, + "grad_norm_var": 0.09000244140625, + "learning_rate": 0.0001, + "loss": 10.0148, + "loss/crossentropy": 2.6205986738204956, + "loss/hidden": 3.8359375, + "loss/jsd": 0.0, + "loss/logits": 0.3867751955986023, + "step": 680 + }, + { + "epoch": 0.042625, + "grad_norm": 4.25, + "grad_norm_var": 0.08826497395833334, + "learning_rate": 0.0001, + "loss": 9.6616, + "loss/crossentropy": 2.3413681983947754, + "loss/hidden": 3.75, + "loss/jsd": 0.0, + "loss/logits": 0.35291582345962524, + "step": 682 + }, + { + "epoch": 0.04275, + "grad_norm": 4.78125, + "grad_norm_var": 0.09547119140625, + "learning_rate": 0.0001, + "loss": 9.7887, + "loss/crossentropy": 2.524027109146118, + "loss/hidden": 3.796875, + "loss/jsd": 0.0, + "loss/logits": 0.3748561441898346, + "step": 684 + }, + { + "epoch": 0.042875, + "grad_norm": 4.25, + "grad_norm_var": 0.09693603515625, + "learning_rate": 0.0001, + "loss": 9.7663, + "loss/crossentropy": 2.586169123649597, + "loss/hidden": 3.8203125, + "loss/jsd": 0.0, + "loss/logits": 0.3734195679426193, + "step": 686 + }, + { + "epoch": 0.043, + "grad_norm": 4.25, + "grad_norm_var": 0.098291015625, + "learning_rate": 0.0001, + "loss": 10.1415, + "loss/crossentropy": 2.575216293334961, + "loss/hidden": 3.921875, + "loss/jsd": 0.0, + "loss/logits": 0.37598639726638794, + "step": 688 + }, + { + "epoch": 0.043125, + "grad_norm": 4.4375, + "grad_norm_var": 0.1171875, + "learning_rate": 0.0001, + "loss": 10.0767, + "loss/crossentropy": 2.782447099685669, + "loss/hidden": 3.9765625, + "loss/jsd": 0.0, + "loss/logits": 0.39427442848682404, + "step": 690 + }, + { + "epoch": 0.04325, + "grad_norm": 4.59375, + "grad_norm_var": 0.10546468098958334, + "learning_rate": 0.0001, + "loss": 10.1002, + "loss/crossentropy": 2.4408079385757446, + "loss/hidden": 3.9375, + "loss/jsd": 0.0, + "loss/logits": 0.39183132350444794, + "step": 692 + }, + { + "epoch": 0.043375, + "grad_norm": 5.09375, + "grad_norm_var": 0.11373291015625, + "learning_rate": 0.0001, + "loss": 9.9914, + "loss/crossentropy": 2.428073763847351, + "loss/hidden": 3.8828125, + "loss/jsd": 0.0, + "loss/logits": 0.3673141598701477, + "step": 694 + }, + { + "epoch": 0.0435, + "grad_norm": 4.1875, + "grad_norm_var": 0.11808268229166667, + "learning_rate": 0.0001, + "loss": 9.8208, + "loss/crossentropy": 2.6366835832595825, + "loss/hidden": 3.78125, + "loss/jsd": 0.0, + "loss/logits": 0.36033181846141815, + "step": 696 + }, + { + "epoch": 0.043625, + "grad_norm": 5.40625, + "grad_norm_var": 0.14921468098958332, + "learning_rate": 0.0001, + "loss": 10.0662, + "loss/crossentropy": 2.119105100631714, + "loss/hidden": 3.984375, + "loss/jsd": 0.0, + "loss/logits": 0.3732317090034485, + "step": 698 + }, + { + "epoch": 0.04375, + "grad_norm": 4.34375, + "grad_norm_var": 0.14996337890625, + "learning_rate": 0.0001, + "loss": 10.0521, + "loss/crossentropy": 2.4997419118881226, + "loss/hidden": 3.796875, + "loss/jsd": 0.0, + "loss/logits": 0.3387012630701065, + "step": 700 + }, + { + "epoch": 0.043875, + "grad_norm": 3.921875, + "grad_norm_var": 0.1813873291015625, + "learning_rate": 0.0001, + "loss": 9.7343, + "loss/crossentropy": 2.2400662899017334, + "loss/hidden": 3.8671875, + "loss/jsd": 0.0, + "loss/logits": 0.3351885676383972, + "step": 702 + }, + { + "epoch": 0.044, + "grad_norm": 5.21875, + "grad_norm_var": 0.21665751139322917, + "learning_rate": 0.0001, + "loss": 9.8692, + "loss/crossentropy": 2.2805423736572266, + "loss/hidden": 3.7734375, + "loss/jsd": 0.0, + "loss/logits": 0.33280137181282043, + "step": 704 + }, + { + "epoch": 0.044125, + "grad_norm": 4.8125, + "grad_norm_var": 0.24158426920572917, + "learning_rate": 0.0001, + "loss": 10.2821, + "loss/crossentropy": 2.5463110208511353, + "loss/hidden": 4.0390625, + "loss/jsd": 0.0, + "loss/logits": 0.4047670066356659, + "step": 706 + }, + { + "epoch": 0.04425, + "grad_norm": 4.28125, + "grad_norm_var": 0.25898335774739584, + "learning_rate": 0.0001, + "loss": 9.893, + "loss/crossentropy": 2.5133782625198364, + "loss/hidden": 3.9140625, + "loss/jsd": 0.0, + "loss/logits": 0.3492252826690674, + "step": 708 + }, + { + "epoch": 0.044375, + "grad_norm": 4.40625, + "grad_norm_var": 0.2576080322265625, + "learning_rate": 0.0001, + "loss": 9.7662, + "loss/crossentropy": 2.58932888507843, + "loss/hidden": 3.890625, + "loss/jsd": 0.0, + "loss/logits": 0.36606909334659576, + "step": 710 + }, + { + "epoch": 0.0445, + "grad_norm": 4.21875, + "grad_norm_var": 0.24614156087239583, + "learning_rate": 0.0001, + "loss": 9.8397, + "loss/crossentropy": 2.5551047325134277, + "loss/hidden": 3.84375, + "loss/jsd": 0.0, + "loss/logits": 0.37199999392032623, + "step": 712 + }, + { + "epoch": 0.044625, + "grad_norm": 4.875, + "grad_norm_var": 0.21269429524739583, + "learning_rate": 0.0001, + "loss": 9.6663, + "loss/crossentropy": 2.2038984298706055, + "loss/hidden": 3.8828125, + "loss/jsd": 0.0, + "loss/logits": 0.355471596121788, + "step": 714 + }, + { + "epoch": 0.04475, + "grad_norm": 4.1875, + "grad_norm_var": 0.22046610514322917, + "learning_rate": 0.0001, + "loss": 9.8526, + "loss/crossentropy": 2.4986603260040283, + "loss/hidden": 3.890625, + "loss/jsd": 0.0, + "loss/logits": 0.3514595031738281, + "step": 716 + }, + { + "epoch": 0.044875, + "grad_norm": 4.5, + "grad_norm_var": 0.17919514973958334, + "learning_rate": 0.0001, + "loss": 9.9912, + "loss/crossentropy": 2.67462694644928, + "loss/hidden": 3.8125, + "loss/jsd": 0.0, + "loss/logits": 0.36870990693569183, + "step": 718 + }, + { + "epoch": 0.045, + "grad_norm": 4.09375, + "grad_norm_var": 0.14490559895833333, + "learning_rate": 0.0001, + "loss": 10.125, + "loss/crossentropy": 2.4971606731414795, + "loss/hidden": 3.8203125, + "loss/jsd": 0.0, + "loss/logits": 0.39218752086162567, + "step": 720 + }, + { + "epoch": 0.045125, + "grad_norm": 4.40625, + "grad_norm_var": 0.060791015625, + "learning_rate": 0.0001, + "loss": 9.7016, + "loss/crossentropy": 2.4049805402755737, + "loss/hidden": 3.8515625, + "loss/jsd": 0.0, + "loss/logits": 0.32926052808761597, + "step": 722 + }, + { + "epoch": 0.04525, + "grad_norm": 4.28125, + "grad_norm_var": 0.055859375, + "learning_rate": 0.0001, + "loss": 9.8177, + "loss/crossentropy": 2.590659022331238, + "loss/hidden": 3.8671875, + "loss/jsd": 0.0, + "loss/logits": 0.3945636451244354, + "step": 724 + }, + { + "epoch": 0.045375, + "grad_norm": 4.96875, + "grad_norm_var": 0.07823893229166666, + "learning_rate": 0.0001, + "loss": 9.6899, + "loss/crossentropy": 2.2857325077056885, + "loss/hidden": 3.8046875, + "loss/jsd": 0.0, + "loss/logits": 0.35235145688056946, + "step": 726 + }, + { + "epoch": 0.0455, + "grad_norm": 4.15625, + "grad_norm_var": 0.08201497395833333, + "learning_rate": 0.0001, + "loss": 9.6263, + "loss/crossentropy": 2.201639175415039, + "loss/hidden": 3.7734375, + "loss/jsd": 0.0, + "loss/logits": 0.33390986919403076, + "step": 728 + }, + { + "epoch": 0.045625, + "grad_norm": 5.03125, + "grad_norm_var": 0.10592447916666667, + "learning_rate": 0.0001, + "loss": 9.9654, + "loss/crossentropy": 2.546342372894287, + "loss/hidden": 3.84375, + "loss/jsd": 0.0, + "loss/logits": 0.3447662442922592, + "step": 730 + }, + { + "epoch": 0.04575, + "grad_norm": 4.3125, + "grad_norm_var": 0.11259358723958333, + "learning_rate": 0.0001, + "loss": 9.6529, + "loss/crossentropy": 2.465666890144348, + "loss/hidden": 3.84375, + "loss/jsd": 0.0, + "loss/logits": 0.3506108671426773, + "step": 732 + }, + { + "epoch": 0.045875, + "grad_norm": 4.34375, + "grad_norm_var": 0.11555989583333333, + "learning_rate": 0.0001, + "loss": 9.7691, + "loss/crossentropy": 2.4628361463546753, + "loss/hidden": 3.8515625, + "loss/jsd": 0.0, + "loss/logits": 0.3611077666282654, + "step": 734 + }, + { + "epoch": 0.046, + "grad_norm": 4.53125, + "grad_norm_var": 0.10852864583333334, + "learning_rate": 0.0001, + "loss": 9.7908, + "loss/crossentropy": 2.450587034225464, + "loss/hidden": 3.859375, + "loss/jsd": 0.0, + "loss/logits": 0.36645807325839996, + "step": 736 + }, + { + "epoch": 0.046125, + "grad_norm": 4.75, + "grad_norm_var": 0.10998942057291666, + "learning_rate": 0.0001, + "loss": 10.0238, + "loss/crossentropy": 2.5827871561050415, + "loss/hidden": 3.78125, + "loss/jsd": 0.0, + "loss/logits": 0.3806481957435608, + "step": 738 + }, + { + "epoch": 0.04625, + "grad_norm": 3.953125, + "grad_norm_var": 0.11862691243489583, + "learning_rate": 0.0001, + "loss": 9.453, + "loss/crossentropy": 2.408301830291748, + "loss/hidden": 3.78125, + "loss/jsd": 0.0, + "loss/logits": 0.3236120641231537, + "step": 740 + }, + { + "epoch": 0.046375, + "grad_norm": 4.34375, + "grad_norm_var": 0.0968658447265625, + "learning_rate": 0.0001, + "loss": 9.7187, + "loss/crossentropy": 2.4682952165603638, + "loss/hidden": 3.9140625, + "loss/jsd": 0.0, + "loss/logits": 0.40484650433063507, + "step": 742 + }, + { + "epoch": 0.0465, + "grad_norm": 4.5, + "grad_norm_var": 0.1087799072265625, + "learning_rate": 0.0001, + "loss": 9.8899, + "loss/crossentropy": 2.7526875734329224, + "loss/hidden": 3.8046875, + "loss/jsd": 0.0, + "loss/logits": 0.3805859088897705, + "step": 744 + }, + { + "epoch": 0.046625, + "grad_norm": 4.59375, + "grad_norm_var": 0.08279520670572917, + "learning_rate": 0.0001, + "loss": 9.8541, + "loss/crossentropy": 2.268938183784485, + "loss/hidden": 3.75, + "loss/jsd": 0.0, + "loss/logits": 0.3416333645582199, + "step": 746 + }, + { + "epoch": 0.04675, + "grad_norm": 4.5, + "grad_norm_var": 0.07593485514322916, + "learning_rate": 0.0001, + "loss": 9.4353, + "loss/crossentropy": 2.4165321588516235, + "loss/hidden": 3.7578125, + "loss/jsd": 0.0, + "loss/logits": 0.33534686267375946, + "step": 748 + }, + { + "epoch": 0.046875, + "grad_norm": 4.21875, + "grad_norm_var": 0.09172261555989583, + "learning_rate": 0.0001, + "loss": 9.4508, + "loss/crossentropy": 2.4512590169906616, + "loss/hidden": 3.7734375, + "loss/jsd": 0.0, + "loss/logits": 0.3545406609773636, + "step": 750 + }, + { + "epoch": 0.047, + "grad_norm": 4.28125, + "grad_norm_var": 0.09014383951822917, + "learning_rate": 0.0001, + "loss": 10.0917, + "loss/crossentropy": 2.545518636703491, + "loss/hidden": 3.96875, + "loss/jsd": 0.0, + "loss/logits": 0.49751946330070496, + "step": 752 + }, + { + "epoch": 0.047125, + "grad_norm": 4.34375, + "grad_norm_var": 0.0890533447265625, + "learning_rate": 0.0001, + "loss": 9.4346, + "loss/crossentropy": 2.2614606618881226, + "loss/hidden": 3.6953125, + "loss/jsd": 0.0, + "loss/logits": 0.34531380236148834, + "step": 754 + }, + { + "epoch": 0.04725, + "grad_norm": 4.3125, + "grad_norm_var": 0.07870686848958333, + "learning_rate": 0.0001, + "loss": 9.3389, + "loss/crossentropy": 2.3133562803268433, + "loss/hidden": 3.78125, + "loss/jsd": 0.0, + "loss/logits": 0.3800860345363617, + "step": 756 + }, + { + "epoch": 0.047375, + "grad_norm": 4.75, + "grad_norm_var": 0.09312744140625, + "learning_rate": 0.0001, + "loss": 9.9658, + "loss/crossentropy": 2.5053844451904297, + "loss/hidden": 3.796875, + "loss/jsd": 0.0, + "loss/logits": 0.348347008228302, + "step": 758 + }, + { + "epoch": 0.0475, + "grad_norm": 4.0625, + "grad_norm_var": 0.06365559895833334, + "learning_rate": 0.0001, + "loss": 9.5957, + "loss/crossentropy": 2.1510268449783325, + "loss/hidden": 3.78125, + "loss/jsd": 0.0, + "loss/logits": 0.3271169662475586, + "step": 760 + }, + { + "epoch": 0.047625, + "grad_norm": 3.796875, + "grad_norm_var": 0.0678131103515625, + "learning_rate": 0.0001, + "loss": 9.5272, + "loss/crossentropy": 2.1917725801467896, + "loss/hidden": 3.75, + "loss/jsd": 0.0, + "loss/logits": 0.32911764085292816, + "step": 762 + }, + { + "epoch": 0.04775, + "grad_norm": 4.625, + "grad_norm_var": 0.07316792805989583, + "learning_rate": 0.0001, + "loss": 9.6196, + "loss/crossentropy": 2.262703061103821, + "loss/hidden": 3.6875, + "loss/jsd": 0.0, + "loss/logits": 0.3407554179430008, + "step": 764 + }, + { + "epoch": 0.047875, + "grad_norm": 4.21875, + "grad_norm_var": 0.0635894775390625, + "learning_rate": 0.0001, + "loss": 9.5615, + "loss/crossentropy": 2.2358585596084595, + "loss/hidden": 3.6796875, + "loss/jsd": 0.0, + "loss/logits": 0.34859590232372284, + "step": 766 + }, + { + "epoch": 0.048, + "grad_norm": 4.5625, + "grad_norm_var": 0.0727691650390625, + "learning_rate": 0.0001, + "loss": 9.7362, + "loss/crossentropy": 2.273680090904236, + "loss/hidden": 3.8671875, + "loss/jsd": 0.0, + "loss/logits": 0.3434666693210602, + "step": 768 + }, + { + "epoch": 0.048125, + "grad_norm": 4.78125, + "grad_norm_var": 0.06664937337239583, + "learning_rate": 0.0001, + "loss": 9.4517, + "loss/crossentropy": 2.2003235816955566, + "loss/hidden": 3.7734375, + "loss/jsd": 0.0, + "loss/logits": 0.34724757075309753, + "step": 770 + }, + { + "epoch": 0.04825, + "grad_norm": 4.0625, + "grad_norm_var": 0.0718170166015625, + "learning_rate": 0.0001, + "loss": 10.0496, + "loss/crossentropy": 2.356285572052002, + "loss/hidden": 3.796875, + "loss/jsd": 0.0, + "loss/logits": 0.34804899990558624, + "step": 772 + }, + { + "epoch": 0.048375, + "grad_norm": 3.75, + "grad_norm_var": 0.0806793212890625, + "learning_rate": 0.0001, + "loss": 9.4018, + "loss/crossentropy": 2.2518192529678345, + "loss/hidden": 3.6875, + "loss/jsd": 0.0, + "loss/logits": 0.3232182711362839, + "step": 774 + }, + { + "epoch": 0.0485, + "grad_norm": 4.75, + "grad_norm_var": 0.0930572509765625, + "learning_rate": 0.0001, + "loss": 9.6718, + "loss/crossentropy": 2.596095561981201, + "loss/hidden": 3.765625, + "loss/jsd": 0.0, + "loss/logits": 0.3486844599246979, + "step": 776 + }, + { + "epoch": 0.048625, + "grad_norm": 4.125, + "grad_norm_var": 0.0822265625, + "learning_rate": 0.0001, + "loss": 9.6132, + "loss/crossentropy": 2.3995965719223022, + "loss/hidden": 3.71875, + "loss/jsd": 0.0, + "loss/logits": 0.3591943085193634, + "step": 778 + }, + { + "epoch": 0.04875, + "grad_norm": 4.375, + "grad_norm_var": 0.07615559895833333, + "learning_rate": 0.0001, + "loss": 9.664, + "loss/crossentropy": 2.212980270385742, + "loss/hidden": 3.7421875, + "loss/jsd": 0.0, + "loss/logits": 0.35707785189151764, + "step": 780 + }, + { + "epoch": 0.048875, + "grad_norm": 4.21875, + "grad_norm_var": 0.0861480712890625, + "learning_rate": 0.0001, + "loss": 9.7829, + "loss/crossentropy": 2.3209805488586426, + "loss/hidden": 3.8046875, + "loss/jsd": 0.0, + "loss/logits": 0.33315178751945496, + "step": 782 + }, + { + "epoch": 0.049, + "grad_norm": 4.0625, + "grad_norm_var": 0.0819976806640625, + "learning_rate": 0.0001, + "loss": 9.6294, + "loss/crossentropy": 2.4062753915786743, + "loss/hidden": 3.7734375, + "loss/jsd": 0.0, + "loss/logits": 0.3330836743116379, + "step": 784 + }, + { + "epoch": 0.049125, + "grad_norm": 4.21875, + "grad_norm_var": 0.06301167805989584, + "learning_rate": 0.0001, + "loss": 9.6967, + "loss/crossentropy": 2.3731807470321655, + "loss/hidden": 3.8671875, + "loss/jsd": 0.0, + "loss/logits": 0.37791262567043304, + "step": 786 + }, + { + "epoch": 0.04925, + "grad_norm": 4.46875, + "grad_norm_var": 0.07166239420572916, + "learning_rate": 0.0001, + "loss": 9.6106, + "loss/crossentropy": 2.1310253143310547, + "loss/hidden": 3.765625, + "loss/jsd": 0.0, + "loss/logits": 0.32675565779209137, + "step": 788 + }, + { + "epoch": 0.049375, + "grad_norm": 3.828125, + "grad_norm_var": 0.06562093098958334, + "learning_rate": 0.0001, + "loss": 9.5732, + "loss/crossentropy": 2.2886255979537964, + "loss/hidden": 3.7421875, + "loss/jsd": 0.0, + "loss/logits": 0.37167444825172424, + "step": 790 + }, + { + "epoch": 0.0495, + "grad_norm": 4.34375, + "grad_norm_var": 0.05289306640625, + "learning_rate": 0.0001, + "loss": 9.5136, + "loss/crossentropy": 2.322494864463806, + "loss/hidden": 3.7109375, + "loss/jsd": 0.0, + "loss/logits": 0.3283519744873047, + "step": 792 + }, + { + "epoch": 0.049625, + "grad_norm": 3.96875, + "grad_norm_var": 0.047196451822916666, + "learning_rate": 0.0001, + "loss": 9.7672, + "loss/crossentropy": 2.7288074493408203, + "loss/hidden": 3.8203125, + "loss/jsd": 0.0, + "loss/logits": 0.370631605386734, + "step": 794 + }, + { + "epoch": 0.04975, + "grad_norm": 4.5, + "grad_norm_var": 0.04845377604166667, + "learning_rate": 0.0001, + "loss": 9.666, + "loss/crossentropy": 2.1383297443389893, + "loss/hidden": 3.6328125, + "loss/jsd": 0.0, + "loss/logits": 0.32776103913784027, + "step": 796 + }, + { + "epoch": 0.049875, + "grad_norm": 4.09375, + "grad_norm_var": 0.0453033447265625, + "learning_rate": 0.0001, + "loss": 9.623, + "loss/crossentropy": 2.4972459077835083, + "loss/hidden": 3.7890625, + "loss/jsd": 0.0, + "loss/logits": 0.36165888607501984, + "step": 798 + }, + { + "epoch": 0.05, + "grad_norm": 4.09375, + "grad_norm_var": 0.05042317708333333, + "learning_rate": 0.0001, + "loss": 9.2915, + "loss/crossentropy": 2.19729745388031, + "loss/hidden": 3.734375, + "loss/jsd": 0.0, + "loss/logits": 0.318468302488327, + "step": 800 + }, + { + "epoch": 0.050125, + "grad_norm": 4.125, + "grad_norm_var": 0.059342447916666666, + "learning_rate": 0.0001, + "loss": 9.3799, + "loss/crossentropy": 2.194010615348816, + "loss/hidden": 3.6640625, + "loss/jsd": 0.0, + "loss/logits": 0.31498509645462036, + "step": 802 + }, + { + "epoch": 0.05025, + "grad_norm": 3.96875, + "grad_norm_var": 0.04267578125, + "learning_rate": 0.0001, + "loss": 9.5517, + "loss/crossentropy": 2.269457697868347, + "loss/hidden": 3.7109375, + "loss/jsd": 0.0, + "loss/logits": 0.3178607076406479, + "step": 804 + }, + { + "epoch": 0.050375, + "grad_norm": 4.40625, + "grad_norm_var": 0.0394927978515625, + "learning_rate": 0.0001, + "loss": 9.2452, + "loss/crossentropy": 2.2216137647628784, + "loss/hidden": 3.6953125, + "loss/jsd": 0.0, + "loss/logits": 0.3251790404319763, + "step": 806 + }, + { + "epoch": 0.0505, + "grad_norm": 4.28125, + "grad_norm_var": 0.04243876139322917, + "learning_rate": 0.0001, + "loss": 9.7497, + "loss/crossentropy": 2.6958311796188354, + "loss/hidden": 3.7890625, + "loss/jsd": 0.0, + "loss/logits": 0.3631015717983246, + "step": 808 + }, + { + "epoch": 0.050625, + "grad_norm": 4.59375, + "grad_norm_var": 0.0543121337890625, + "learning_rate": 0.0001, + "loss": 9.7743, + "loss/crossentropy": 2.6329739093780518, + "loss/hidden": 3.7421875, + "loss/jsd": 0.0, + "loss/logits": 0.3427456319332123, + "step": 810 + }, + { + "epoch": 0.05075, + "grad_norm": 4.21875, + "grad_norm_var": 0.0827789306640625, + "learning_rate": 0.0001, + "loss": 9.861, + "loss/crossentropy": 2.41109561920166, + "loss/hidden": 3.859375, + "loss/jsd": 0.0, + "loss/logits": 0.38177673518657684, + "step": 812 + }, + { + "epoch": 0.050875, + "grad_norm": 4.125, + "grad_norm_var": 0.09693603515625, + "learning_rate": 0.0001, + "loss": 9.3459, + "loss/crossentropy": 2.413679838180542, + "loss/hidden": 3.7109375, + "loss/jsd": 0.0, + "loss/logits": 0.3323095142841339, + "step": 814 + }, + { + "epoch": 0.051, + "grad_norm": 3.96875, + "grad_norm_var": 0.10078837076822916, + "learning_rate": 0.0001, + "loss": 9.2878, + "loss/crossentropy": 2.4613407850265503, + "loss/hidden": 3.7734375, + "loss/jsd": 0.0, + "loss/logits": 0.36210502684116364, + "step": 816 + }, + { + "epoch": 0.051125, + "grad_norm": 4.28125, + "grad_norm_var": 0.09621480305989584, + "learning_rate": 0.0001, + "loss": 9.5978, + "loss/crossentropy": 2.3388036489486694, + "loss/hidden": 3.8671875, + "loss/jsd": 0.0, + "loss/logits": 0.33505263924598694, + "step": 818 + }, + { + "epoch": 0.05125, + "grad_norm": 3.859375, + "grad_norm_var": 0.10100911458333334, + "learning_rate": 0.0001, + "loss": 9.5206, + "loss/crossentropy": 2.504610538482666, + "loss/hidden": 3.7890625, + "loss/jsd": 0.0, + "loss/logits": 0.35333533585071564, + "step": 820 + }, + { + "epoch": 0.051375, + "grad_norm": 4.53125, + "grad_norm_var": 0.10614827473958334, + "learning_rate": 0.0001, + "loss": 9.707, + "loss/crossentropy": 2.3531359434127808, + "loss/hidden": 3.84375, + "loss/jsd": 0.0, + "loss/logits": 0.3614940941333771, + "step": 822 + }, + { + "epoch": 0.0515, + "grad_norm": 3.890625, + "grad_norm_var": 0.11204325358072917, + "learning_rate": 0.0001, + "loss": 9.3598, + "loss/crossentropy": 2.0972710251808167, + "loss/hidden": 3.625, + "loss/jsd": 0.0, + "loss/logits": 0.31728605926036835, + "step": 824 + }, + { + "epoch": 0.051625, + "grad_norm": 4.1875, + "grad_norm_var": 0.09622395833333333, + "learning_rate": 0.0001, + "loss": 9.3816, + "loss/crossentropy": 2.275819420814514, + "loss/hidden": 3.7578125, + "loss/jsd": 0.0, + "loss/logits": 0.34381118416786194, + "step": 826 + }, + { + "epoch": 0.05175, + "grad_norm": 4.125, + "grad_norm_var": 0.04998372395833333, + "learning_rate": 0.0001, + "loss": 9.6247, + "loss/crossentropy": 2.45046067237854, + "loss/hidden": 3.7734375, + "loss/jsd": 0.0, + "loss/logits": 0.3419671058654785, + "step": 828 + }, + { + "epoch": 0.051875, + "grad_norm": 4.53125, + "grad_norm_var": 0.05650126139322917, + "learning_rate": 0.0001, + "loss": 9.8951, + "loss/crossentropy": 2.7096316814422607, + "loss/hidden": 3.7109375, + "loss/jsd": 0.0, + "loss/logits": 0.34755463898181915, + "step": 830 + }, + { + "epoch": 0.052, + "grad_norm": 4.3125, + "grad_norm_var": 0.05390523274739583, + "learning_rate": 0.0001, + "loss": 9.5586, + "loss/crossentropy": 2.4081461429595947, + "loss/hidden": 3.75, + "loss/jsd": 0.0, + "loss/logits": 0.35109463334083557, + "step": 832 + }, + { + "epoch": 0.052125, + "grad_norm": 4.25, + "grad_norm_var": 0.0544830322265625, + "learning_rate": 0.0001, + "loss": 9.338, + "loss/crossentropy": 2.0195173621177673, + "loss/hidden": 3.640625, + "loss/jsd": 0.0, + "loss/logits": 0.3229655623435974, + "step": 834 + }, + { + "epoch": 0.05225, + "grad_norm": 4.3125, + "grad_norm_var": 0.05168863932291667, + "learning_rate": 0.0001, + "loss": 9.4718, + "loss/crossentropy": 2.262600541114807, + "loss/hidden": 3.734375, + "loss/jsd": 0.0, + "loss/logits": 0.33351391553878784, + "step": 836 + }, + { + "epoch": 0.052375, + "grad_norm": 3.9375, + "grad_norm_var": 0.05157877604166667, + "learning_rate": 0.0001, + "loss": 9.5338, + "loss/crossentropy": 2.387451410293579, + "loss/hidden": 3.8515625, + "loss/jsd": 0.0, + "loss/logits": 0.3783055394887924, + "step": 838 + }, + { + "epoch": 0.0525, + "grad_norm": 3.796875, + "grad_norm_var": 0.04944661458333333, + "learning_rate": 0.0001, + "loss": 9.5232, + "loss/crossentropy": 2.5872695446014404, + "loss/hidden": 3.6640625, + "loss/jsd": 0.0, + "loss/logits": 0.34008149802684784, + "step": 840 + }, + { + "epoch": 0.052625, + "grad_norm": 3.984375, + "grad_norm_var": 0.048990885416666664, + "learning_rate": 0.0001, + "loss": 9.4675, + "loss/crossentropy": 2.7655253410339355, + "loss/hidden": 3.7734375, + "loss/jsd": 0.0, + "loss/logits": 0.340924471616745, + "step": 842 + }, + { + "epoch": 0.05275, + "grad_norm": 3.859375, + "grad_norm_var": 0.050633748372395836, + "learning_rate": 0.0001, + "loss": 9.2679, + "loss/crossentropy": 2.4758448600769043, + "loss/hidden": 3.6328125, + "loss/jsd": 0.0, + "loss/logits": 0.32951878011226654, + "step": 844 + }, + { + "epoch": 0.052875, + "grad_norm": 4.375, + "grad_norm_var": 0.06552632649739583, + "learning_rate": 0.0001, + "loss": 9.9271, + "loss/crossentropy": 2.6429070234298706, + "loss/hidden": 3.8203125, + "loss/jsd": 0.0, + "loss/logits": 0.3814462423324585, + "step": 846 + }, + { + "epoch": 0.053, + "grad_norm": 4.625, + "grad_norm_var": 0.08034566243489584, + "learning_rate": 0.0001, + "loss": 10.0684, + "loss/crossentropy": 2.209423542022705, + "loss/hidden": 3.90625, + "loss/jsd": 0.0, + "loss/logits": 0.3319186717271805, + "step": 848 + }, + { + "epoch": 0.053125, + "grad_norm": 3.96875, + "grad_norm_var": 0.0769683837890625, + "learning_rate": 0.0001, + "loss": 9.5685, + "loss/crossentropy": 2.2579764127731323, + "loss/hidden": 3.7265625, + "loss/jsd": 0.0, + "loss/logits": 0.3442992717027664, + "step": 850 + }, + { + "epoch": 0.05325, + "grad_norm": 4.84375, + "grad_norm_var": 0.1148834228515625, + "learning_rate": 0.0001, + "loss": 9.4361, + "loss/crossentropy": 2.39439857006073, + "loss/hidden": 3.734375, + "loss/jsd": 0.0, + "loss/logits": 0.3444969654083252, + "step": 852 + }, + { + "epoch": 0.053375, + "grad_norm": 4.28125, + "grad_norm_var": 0.10398661295572917, + "learning_rate": 0.0001, + "loss": 9.5651, + "loss/crossentropy": 2.504552960395813, + "loss/hidden": 3.640625, + "loss/jsd": 0.0, + "loss/logits": 0.34091413021087646, + "step": 854 + }, + { + "epoch": 0.0535, + "grad_norm": 4.59375, + "grad_norm_var": 0.09475911458333333, + "learning_rate": 0.0001, + "loss": 9.7258, + "loss/crossentropy": 2.4847280979156494, + "loss/hidden": 3.8828125, + "loss/jsd": 0.0, + "loss/logits": 0.3863519877195358, + "step": 856 + }, + { + "epoch": 0.053625, + "grad_norm": 4.125, + "grad_norm_var": 0.08271382649739584, + "learning_rate": 0.0001, + "loss": 9.5068, + "loss/crossentropy": 2.5517282485961914, + "loss/hidden": 3.71875, + "loss/jsd": 0.0, + "loss/logits": 0.33669134974479675, + "step": 858 + }, + { + "epoch": 0.05375, + "grad_norm": 3.90625, + "grad_norm_var": 0.07314046223958333, + "learning_rate": 0.0001, + "loss": 9.5156, + "loss/crossentropy": 2.450470209121704, + "loss/hidden": 3.6953125, + "loss/jsd": 0.0, + "loss/logits": 0.35644595324993134, + "step": 860 + }, + { + "epoch": 0.053875, + "grad_norm": 4.40625, + "grad_norm_var": 0.0788970947265625, + "learning_rate": 0.0001, + "loss": 9.7869, + "loss/crossentropy": 2.687352776527405, + "loss/hidden": 3.71875, + "loss/jsd": 0.0, + "loss/logits": 0.33138976991176605, + "step": 862 + }, + { + "epoch": 0.054, + "grad_norm": 4.59375, + "grad_norm_var": 0.1000152587890625, + "learning_rate": 0.0001, + "loss": 9.412, + "loss/crossentropy": 2.5792051553726196, + "loss/hidden": 3.8046875, + "loss/jsd": 0.0, + "loss/logits": 0.3428095132112503, + "step": 864 + }, + { + "epoch": 0.054125, + "grad_norm": 4.09375, + "grad_norm_var": 0.10471903483072917, + "learning_rate": 0.0001, + "loss": 9.4376, + "loss/crossentropy": 2.189521312713623, + "loss/hidden": 3.703125, + "loss/jsd": 0.0, + "loss/logits": 0.3646356761455536, + "step": 866 + }, + { + "epoch": 0.05425, + "grad_norm": 6.4375, + "grad_norm_var": 0.37280171712239585, + "learning_rate": 0.0001, + "loss": 9.9946, + "loss/crossentropy": 2.4185925722122192, + "loss/hidden": 3.671875, + "loss/jsd": 0.0, + "loss/logits": 0.359613835811615, + "step": 868 + }, + { + "epoch": 0.054375, + "grad_norm": 5.09375, + "grad_norm_var": 0.4150299072265625, + "learning_rate": 0.0001, + "loss": 9.6361, + "loss/crossentropy": 2.419649362564087, + "loss/hidden": 3.7734375, + "loss/jsd": 0.0, + "loss/logits": 0.33111467957496643, + "step": 870 + }, + { + "epoch": 0.0545, + "grad_norm": 5.09375, + "grad_norm_var": 0.4413970947265625, + "learning_rate": 0.0001, + "loss": 9.6636, + "loss/crossentropy": 2.4185843467712402, + "loss/hidden": 3.78125, + "loss/jsd": 0.0, + "loss/logits": 0.35009828209877014, + "step": 872 + }, + { + "epoch": 0.054625, + "grad_norm": 4.3125, + "grad_norm_var": 0.44112040201822916, + "learning_rate": 0.0001, + "loss": 9.6933, + "loss/crossentropy": 2.2782691717147827, + "loss/hidden": 3.71875, + "loss/jsd": 0.0, + "loss/logits": 0.388755202293396, + "step": 874 + }, + { + "epoch": 0.05475, + "grad_norm": 3.734375, + "grad_norm_var": 0.459912109375, + "learning_rate": 0.0001, + "loss": 9.7969, + "loss/crossentropy": 2.194816470146179, + "loss/hidden": 3.65625, + "loss/jsd": 0.0, + "loss/logits": 0.32018932700157166, + "step": 876 + }, + { + "epoch": 0.054875, + "grad_norm": 4.5625, + "grad_norm_var": 0.44416402180989584, + "learning_rate": 0.0001, + "loss": 9.6102, + "loss/crossentropy": 2.50557017326355, + "loss/hidden": 3.7265625, + "loss/jsd": 0.0, + "loss/logits": 0.3518366515636444, + "step": 878 + }, + { + "epoch": 0.055, + "grad_norm": 4.78125, + "grad_norm_var": 0.42001546223958336, + "learning_rate": 0.0001, + "loss": 9.7176, + "loss/crossentropy": 2.4951841831207275, + "loss/hidden": 3.765625, + "loss/jsd": 0.0, + "loss/logits": 0.3479674905538559, + "step": 880 + }, + { + "epoch": 0.055125, + "grad_norm": 3.953125, + "grad_norm_var": 0.4462636311848958, + "learning_rate": 0.0001, + "loss": 9.2982, + "loss/crossentropy": 2.2794214487075806, + "loss/hidden": 3.75, + "loss/jsd": 0.0, + "loss/logits": 0.33856503665447235, + "step": 882 + }, + { + "epoch": 0.05525, + "grad_norm": 4.4375, + "grad_norm_var": 0.20742899576822918, + "learning_rate": 0.0001, + "loss": 9.5027, + "loss/crossentropy": 2.420092821121216, + "loss/hidden": 3.84375, + "loss/jsd": 0.0, + "loss/logits": 0.32554225623607635, + "step": 884 + }, + { + "epoch": 0.055375, + "grad_norm": 4.5625, + "grad_norm_var": 0.1740386962890625, + "learning_rate": 0.0001, + "loss": 9.514, + "loss/crossentropy": 2.4305249452590942, + "loss/hidden": 3.71875, + "loss/jsd": 0.0, + "loss/logits": 0.3317463994026184, + "step": 886 + }, + { + "epoch": 0.0555, + "grad_norm": 5.03125, + "grad_norm_var": 0.1664215087890625, + "learning_rate": 0.0001, + "loss": 9.7395, + "loss/crossentropy": 2.3821409940719604, + "loss/hidden": 3.75, + "loss/jsd": 0.0, + "loss/logits": 0.36465059220790863, + "step": 888 + }, + { + "epoch": 0.055625, + "grad_norm": 4.4375, + "grad_norm_var": 0.14842020670572917, + "learning_rate": 0.0001, + "loss": 9.2789, + "loss/crossentropy": 2.298262596130371, + "loss/hidden": 3.6875, + "loss/jsd": 0.0, + "loss/logits": 0.31890998780727386, + "step": 890 + }, + { + "epoch": 0.05575, + "grad_norm": 4.03125, + "grad_norm_var": 0.12892252604166668, + "learning_rate": 0.0001, + "loss": 9.7294, + "loss/crossentropy": 2.264374613761902, + "loss/hidden": 3.7109375, + "loss/jsd": 0.0, + "loss/logits": 0.31738966703414917, + "step": 892 + }, + { + "epoch": 0.055875, + "grad_norm": 4.0625, + "grad_norm_var": 0.10719401041666667, + "learning_rate": 0.0001, + "loss": 9.669, + "loss/crossentropy": 2.557625889778137, + "loss/hidden": 3.6875, + "loss/jsd": 0.0, + "loss/logits": 0.37109945714473724, + "step": 894 + }, + { + "epoch": 0.056, + "grad_norm": 3.8125, + "grad_norm_var": 0.0967437744140625, + "learning_rate": 0.0001, + "loss": 9.3578, + "loss/crossentropy": 2.513554573059082, + "loss/hidden": 3.6484375, + "loss/jsd": 0.0, + "loss/logits": 0.3437999337911606, + "step": 896 + }, + { + "epoch": 0.056125, + "grad_norm": 4.34375, + "grad_norm_var": 0.10321858723958334, + "learning_rate": 0.0001, + "loss": 9.6894, + "loss/crossentropy": 2.6883383989334106, + "loss/hidden": 3.6796875, + "loss/jsd": 0.0, + "loss/logits": 0.34091816842556, + "step": 898 + }, + { + "epoch": 0.05625, + "grad_norm": 4.21875, + "grad_norm_var": 0.10778706868489583, + "learning_rate": 0.0001, + "loss": 9.5283, + "loss/crossentropy": 2.5178507566452026, + "loss/hidden": 3.59375, + "loss/jsd": 0.0, + "loss/logits": 0.31220842897892, + "step": 900 + }, + { + "epoch": 0.056375, + "grad_norm": 4.09375, + "grad_norm_var": 0.11022847493489583, + "learning_rate": 0.0001, + "loss": 9.4134, + "loss/crossentropy": 2.286848306655884, + "loss/hidden": 3.7265625, + "loss/jsd": 0.0, + "loss/logits": 0.36784152686595917, + "step": 902 + }, + { + "epoch": 0.0565, + "grad_norm": 3.75, + "grad_norm_var": 0.08430887858072916, + "learning_rate": 0.0001, + "loss": 9.5666, + "loss/crossentropy": 2.330216407775879, + "loss/hidden": 3.640625, + "loss/jsd": 0.0, + "loss/logits": 0.3340802788734436, + "step": 904 + }, + { + "epoch": 0.056625, + "grad_norm": 4.15625, + "grad_norm_var": 0.07683919270833334, + "learning_rate": 0.0001, + "loss": 9.4613, + "loss/crossentropy": 2.5783122777938843, + "loss/hidden": 3.765625, + "loss/jsd": 0.0, + "loss/logits": 0.3707699775695801, + "step": 906 + }, + { + "epoch": 0.05675, + "grad_norm": 4.0, + "grad_norm_var": 0.07731119791666667, + "learning_rate": 0.0001, + "loss": 9.6405, + "loss/crossentropy": 2.39057457447052, + "loss/hidden": 3.6875, + "loss/jsd": 0.0, + "loss/logits": 0.31178198754787445, + "step": 908 + }, + { + "epoch": 0.056875, + "grad_norm": 4.0625, + "grad_norm_var": 0.075927734375, + "learning_rate": 0.0001, + "loss": 9.3792, + "loss/crossentropy": 2.2321670055389404, + "loss/hidden": 3.7265625, + "loss/jsd": 0.0, + "loss/logits": 0.39315053820610046, + "step": 910 + }, + { + "epoch": 0.057, + "grad_norm": 4.125, + "grad_norm_var": 0.06603190104166666, + "learning_rate": 0.0001, + "loss": 9.4016, + "loss/crossentropy": 2.457381010055542, + "loss/hidden": 3.703125, + "loss/jsd": 0.0, + "loss/logits": 0.3697053790092468, + "step": 912 + }, + { + "epoch": 0.057125, + "grad_norm": 4.21875, + "grad_norm_var": 0.05308837890625, + "learning_rate": 0.0001, + "loss": 9.705, + "loss/crossentropy": 2.3566343784332275, + "loss/hidden": 3.671875, + "loss/jsd": 0.0, + "loss/logits": 0.31618085503578186, + "step": 914 + }, + { + "epoch": 0.05725, + "grad_norm": 3.734375, + "grad_norm_var": 0.05917561848958333, + "learning_rate": 0.0001, + "loss": 9.2448, + "loss/crossentropy": 2.349318027496338, + "loss/hidden": 3.59375, + "loss/jsd": 0.0, + "loss/logits": 0.3112690597772598, + "step": 916 + }, + { + "epoch": 0.057375, + "grad_norm": 5.0625, + "grad_norm_var": 0.09866129557291667, + "learning_rate": 0.0001, + "loss": 9.7381, + "loss/crossentropy": 2.605436682701111, + "loss/hidden": 3.8046875, + "loss/jsd": 0.0, + "loss/logits": 0.38693949580192566, + "step": 918 + }, + { + "epoch": 0.0575, + "grad_norm": 4.25, + "grad_norm_var": 0.08850504557291666, + "learning_rate": 0.0001, + "loss": 9.36, + "loss/crossentropy": 2.3533878326416016, + "loss/hidden": 3.71875, + "loss/jsd": 0.0, + "loss/logits": 0.32768990099430084, + "step": 920 + }, + { + "epoch": 0.057625, + "grad_norm": 3.71875, + "grad_norm_var": 0.13023173014322917, + "learning_rate": 0.0001, + "loss": 9.4501, + "loss/crossentropy": 2.4686715602874756, + "loss/hidden": 3.703125, + "loss/jsd": 0.0, + "loss/logits": 0.314393013715744, + "step": 922 + }, + { + "epoch": 0.05775, + "grad_norm": 3.640625, + "grad_norm_var": 0.16578776041666668, + "learning_rate": 0.0001, + "loss": 9.0475, + "loss/crossentropy": 2.204137921333313, + "loss/hidden": 3.6328125, + "loss/jsd": 0.0, + "loss/logits": 0.30986711382865906, + "step": 924 + }, + { + "epoch": 0.057875, + "grad_norm": 3.65625, + "grad_norm_var": 0.1955230712890625, + "learning_rate": 0.0001, + "loss": 9.2625, + "loss/crossentropy": 2.505138397216797, + "loss/hidden": 3.6640625, + "loss/jsd": 0.0, + "loss/logits": 0.3187219649553299, + "step": 926 + }, + { + "epoch": 0.058, + "grad_norm": 4.09375, + "grad_norm_var": 0.19621988932291667, + "learning_rate": 0.0001, + "loss": 9.2882, + "loss/crossentropy": 2.4183324575424194, + "loss/hidden": 3.6640625, + "loss/jsd": 0.0, + "loss/logits": 0.3302215486764908, + "step": 928 + }, + { + "epoch": 0.058125, + "grad_norm": 4.03125, + "grad_norm_var": 0.179736328125, + "learning_rate": 0.0001, + "loss": 9.5348, + "loss/crossentropy": 2.4528021812438965, + "loss/hidden": 3.7421875, + "loss/jsd": 0.0, + "loss/logits": 0.34695957601070404, + "step": 930 + }, + { + "epoch": 0.05825, + "grad_norm": 4.1875, + "grad_norm_var": 0.18042704264322917, + "learning_rate": 0.0001, + "loss": 9.3228, + "loss/crossentropy": 2.2103404998779297, + "loss/hidden": 3.6484375, + "loss/jsd": 0.0, + "loss/logits": 0.3665204644203186, + "step": 932 + }, + { + "epoch": 0.058375, + "grad_norm": 4.15625, + "grad_norm_var": 0.10943094889322917, + "learning_rate": 0.0001, + "loss": 9.3404, + "loss/crossentropy": 2.180467367172241, + "loss/hidden": 3.59375, + "loss/jsd": 0.0, + "loss/logits": 0.299451008439064, + "step": 934 + }, + { + "epoch": 0.0585, + "grad_norm": 3.703125, + "grad_norm_var": 0.11096598307291666, + "learning_rate": 0.0001, + "loss": 9.3411, + "loss/crossentropy": 2.7028924226760864, + "loss/hidden": 3.671875, + "loss/jsd": 0.0, + "loss/logits": 0.35058994591236115, + "step": 936 + }, + { + "epoch": 0.058625, + "grad_norm": 4.0, + "grad_norm_var": 0.04487202962239583, + "learning_rate": 0.0001, + "loss": 9.3285, + "loss/crossentropy": 2.4909303188323975, + "loss/hidden": 3.6875, + "loss/jsd": 0.0, + "loss/logits": 0.33308878540992737, + "step": 938 + }, + { + "epoch": 0.05875, + "grad_norm": 3.90625, + "grad_norm_var": 0.03717041015625, + "learning_rate": 0.0001, + "loss": 9.4385, + "loss/crossentropy": 2.3014419078826904, + "loss/hidden": 3.6015625, + "loss/jsd": 0.0, + "loss/logits": 0.3322508633136749, + "step": 940 + }, + { + "epoch": 0.058875, + "grad_norm": 4.09375, + "grad_norm_var": 0.0290435791015625, + "learning_rate": 0.0001, + "loss": 9.5862, + "loss/crossentropy": 2.5005375146865845, + "loss/hidden": 3.765625, + "loss/jsd": 0.0, + "loss/logits": 0.34335020184516907, + "step": 942 + }, + { + "epoch": 0.059, + "grad_norm": 5.0, + "grad_norm_var": 0.10372721354166667, + "learning_rate": 0.0001, + "loss": 9.4993, + "loss/crossentropy": 2.428452491760254, + "loss/hidden": 3.7109375, + "loss/jsd": 0.0, + "loss/logits": 0.3558335155248642, + "step": 944 + }, + { + "epoch": 0.059125, + "grad_norm": 4.46875, + "grad_norm_var": 0.11492513020833334, + "learning_rate": 0.0001, + "loss": 9.3561, + "loss/crossentropy": 2.450140953063965, + "loss/hidden": 3.75, + "loss/jsd": 0.0, + "loss/logits": 0.32736936211586, + "step": 946 + }, + { + "epoch": 0.05925, + "grad_norm": 3.609375, + "grad_norm_var": 0.12048238118489583, + "learning_rate": 0.0001, + "loss": 9.4383, + "loss/crossentropy": 2.4876564741134644, + "loss/hidden": 3.7109375, + "loss/jsd": 0.0, + "loss/logits": 0.31698183715343475, + "step": 948 + }, + { + "epoch": 0.059375, + "grad_norm": 4.21875, + "grad_norm_var": 0.12451883951822916, + "learning_rate": 0.0001, + "loss": 9.6831, + "loss/crossentropy": 2.384592890739441, + "loss/hidden": 3.7421875, + "loss/jsd": 0.0, + "loss/logits": 0.32197698950767517, + "step": 950 + }, + { + "epoch": 0.0595, + "grad_norm": 3.59375, + "grad_norm_var": 0.12683919270833333, + "learning_rate": 0.0001, + "loss": 9.2944, + "loss/crossentropy": 2.35392427444458, + "loss/hidden": 3.65625, + "loss/jsd": 0.0, + "loss/logits": 0.36352527141571045, + "step": 952 + }, + { + "epoch": 0.059625, + "grad_norm": 4.0625, + "grad_norm_var": 0.11980794270833334, + "learning_rate": 0.0001, + "loss": 9.0958, + "loss/crossentropy": 2.4466415643692017, + "loss/hidden": 3.6328125, + "loss/jsd": 0.0, + "loss/logits": 0.30798208713531494, + "step": 954 + }, + { + "epoch": 0.05975, + "grad_norm": 4.375, + "grad_norm_var": 0.11995442708333333, + "learning_rate": 0.0001, + "loss": 9.4957, + "loss/crossentropy": 2.5927644968032837, + "loss/hidden": 3.5546875, + "loss/jsd": 0.0, + "loss/logits": 0.33347761631011963, + "step": 956 + }, + { + "epoch": 0.059875, + "grad_norm": 4.3125, + "grad_norm_var": 0.12353413899739583, + "learning_rate": 0.0001, + "loss": 9.3342, + "loss/crossentropy": 2.4658687114715576, + "loss/hidden": 3.75, + "loss/jsd": 0.0, + "loss/logits": 0.3318018615245819, + "step": 958 + }, + { + "epoch": 0.06, + "grad_norm": 4.28125, + "grad_norm_var": 0.06822001139322917, + "learning_rate": 0.0001, + "loss": 9.4473, + "loss/crossentropy": 2.4019787311553955, + "loss/hidden": 3.7734375, + "loss/jsd": 0.0, + "loss/logits": 0.3761335462331772, + "step": 960 + }, + { + "epoch": 0.060125, + "grad_norm": 4.46875, + "grad_norm_var": 0.06902567545572917, + "learning_rate": 0.0001, + "loss": 9.4095, + "loss/crossentropy": 2.6830371618270874, + "loss/hidden": 3.703125, + "loss/jsd": 0.0, + "loss/logits": 0.32209448516368866, + "step": 962 + }, + { + "epoch": 0.06025, + "grad_norm": 3.875, + "grad_norm_var": 0.059716796875, + "learning_rate": 0.0001, + "loss": 9.3998, + "loss/crossentropy": 2.283499240875244, + "loss/hidden": 3.640625, + "loss/jsd": 0.0, + "loss/logits": 0.3431689292192459, + "step": 964 + }, + { + "epoch": 0.060375, + "grad_norm": 3.8125, + "grad_norm_var": 0.0599761962890625, + "learning_rate": 0.0001, + "loss": 9.12, + "loss/crossentropy": 2.146597146987915, + "loss/hidden": 3.640625, + "loss/jsd": 0.0, + "loss/logits": 0.32681937515735626, + "step": 966 + }, + { + "epoch": 0.0605, + "grad_norm": 3.796875, + "grad_norm_var": 0.048127237955729166, + "learning_rate": 0.0001, + "loss": 9.2878, + "loss/crossentropy": 2.3824340105056763, + "loss/hidden": 3.65625, + "loss/jsd": 0.0, + "loss/logits": 0.32292675971984863, + "step": 968 + }, + { + "epoch": 0.060625, + "grad_norm": 3.875, + "grad_norm_var": 0.060465494791666664, + "learning_rate": 0.0001, + "loss": 9.1892, + "loss/crossentropy": 2.3321211338043213, + "loss/hidden": 3.703125, + "loss/jsd": 0.0, + "loss/logits": 0.3173847645521164, + "step": 970 + }, + { + "epoch": 0.06075, + "grad_norm": 4.09375, + "grad_norm_var": 0.05028889973958333, + "learning_rate": 0.0001, + "loss": 9.5526, + "loss/crossentropy": 2.5641666650772095, + "loss/hidden": 3.6484375, + "loss/jsd": 0.0, + "loss/logits": 0.35992586612701416, + "step": 972 + }, + { + "epoch": 0.060875, + "grad_norm": 3.984375, + "grad_norm_var": 0.04504801432291667, + "learning_rate": 0.0001, + "loss": 9.3705, + "loss/crossentropy": 2.34674608707428, + "loss/hidden": 3.6796875, + "loss/jsd": 0.0, + "loss/logits": 0.32495957612991333, + "step": 974 + }, + { + "epoch": 0.061, + "grad_norm": 4.03125, + "grad_norm_var": 0.065625, + "learning_rate": 0.0001, + "loss": 9.3391, + "loss/crossentropy": 2.447916865348816, + "loss/hidden": 3.6796875, + "loss/jsd": 0.0, + "loss/logits": 0.3161363750696182, + "step": 976 + }, + { + "epoch": 0.061125, + "grad_norm": 3.796875, + "grad_norm_var": 0.0483062744140625, + "learning_rate": 0.0001, + "loss": 9.3675, + "loss/crossentropy": 2.361741304397583, + "loss/hidden": 3.578125, + "loss/jsd": 0.0, + "loss/logits": 0.30326806008815765, + "step": 978 + }, + { + "epoch": 0.06125, + "grad_norm": 3.90625, + "grad_norm_var": 0.05650126139322917, + "learning_rate": 0.0001, + "loss": 9.3554, + "loss/crossentropy": 2.4683319330215454, + "loss/hidden": 3.671875, + "loss/jsd": 0.0, + "loss/logits": 0.34657470881938934, + "step": 980 + }, + { + "epoch": 0.061375, + "grad_norm": 3.828125, + "grad_norm_var": 0.05673421223958333, + "learning_rate": 0.0001, + "loss": 9.229, + "loss/crossentropy": 2.26959490776062, + "loss/hidden": 3.625, + "loss/jsd": 0.0, + "loss/logits": 0.3553328216075897, + "step": 982 + }, + { + "epoch": 0.0615, + "grad_norm": 4.03125, + "grad_norm_var": 0.0560455322265625, + "learning_rate": 0.0001, + "loss": 9.2883, + "loss/crossentropy": 2.5307698249816895, + "loss/hidden": 3.703125, + "loss/jsd": 0.0, + "loss/logits": 0.31306394934654236, + "step": 984 + }, + { + "epoch": 0.061625, + "grad_norm": 5.0625, + "grad_norm_var": 0.16921284993489583, + "learning_rate": 0.0001, + "loss": 9.4596, + "loss/crossentropy": 2.325950264930725, + "loss/hidden": 3.6328125, + "loss/jsd": 0.0, + "loss/logits": 0.3328956216573715, + "step": 986 + }, + { + "epoch": 0.06175, + "grad_norm": 3.6875, + "grad_norm_var": 0.18103739420572917, + "learning_rate": 0.0001, + "loss": 9.2764, + "loss/crossentropy": 2.3399884700775146, + "loss/hidden": 3.6640625, + "loss/jsd": 0.0, + "loss/logits": 0.3336493968963623, + "step": 988 + }, + { + "epoch": 0.061875, + "grad_norm": 3.84375, + "grad_norm_var": 0.18277079264322918, + "learning_rate": 0.0001, + "loss": 9.2997, + "loss/crossentropy": 2.4344476461410522, + "loss/hidden": 3.6640625, + "loss/jsd": 0.0, + "loss/logits": 0.31361398100852966, + "step": 990 + }, + { + "epoch": 0.062, + "grad_norm": 3.984375, + "grad_norm_var": 0.17678629557291667, + "learning_rate": 0.0001, + "loss": 9.4531, + "loss/crossentropy": 2.532125949859619, + "loss/hidden": 3.5859375, + "loss/jsd": 0.0, + "loss/logits": 0.31050997972488403, + "step": 992 + }, + { + "epoch": 0.062125, + "grad_norm": 5.625, + "grad_norm_var": 0.3181711832682292, + "learning_rate": 0.0001, + "loss": 9.3085, + "loss/crossentropy": 2.205925226211548, + "loss/hidden": 3.65625, + "loss/jsd": 0.0, + "loss/logits": 0.3132568746805191, + "step": 994 + }, + { + "epoch": 0.06225, + "grad_norm": 4.5, + "grad_norm_var": 0.29087626139322914, + "learning_rate": 0.0001, + "loss": 9.4214, + "loss/crossentropy": 2.3645405769348145, + "loss/hidden": 3.6484375, + "loss/jsd": 0.0, + "loss/logits": 0.33257976174354553, + "step": 996 + }, + { + "epoch": 0.062375, + "grad_norm": 4.125, + "grad_norm_var": 0.2752349853515625, + "learning_rate": 0.0001, + "loss": 9.248, + "loss/crossentropy": 2.6228253841400146, + "loss/hidden": 3.6484375, + "loss/jsd": 0.0, + "loss/logits": 0.3128903806209564, + "step": 998 + }, + { + "epoch": 0.0625, + "grad_norm": 3.828125, + "grad_norm_var": 0.2778472900390625, + "learning_rate": 0.0001, + "loss": 9.3876, + "loss/crossentropy": 2.4406707286834717, + "loss/hidden": 3.703125, + "loss/jsd": 0.0, + "loss/logits": 0.3451492637395859, + "step": 1000 + }, + { + "epoch": 0.062625, + "grad_norm": 3.625, + "grad_norm_var": 0.22431233723958333, + "learning_rate": 0.0001, + "loss": 9.3272, + "loss/crossentropy": 2.1737005710601807, + "loss/hidden": 3.609375, + "loss/jsd": 0.0, + "loss/logits": 0.2996635288000107, + "step": 1002 + }, + { + "epoch": 0.06275, + "grad_norm": 3.671875, + "grad_norm_var": 0.22522684733072917, + "learning_rate": 0.0001, + "loss": 9.1276, + "loss/crossentropy": 2.3624621629714966, + "loss/hidden": 3.625, + "loss/jsd": 0.0, + "loss/logits": 0.32320792973041534, + "step": 1004 + }, + { + "epoch": 0.062875, + "grad_norm": 3.84375, + "grad_norm_var": 0.22323811848958333, + "learning_rate": 0.0001, + "loss": 9.1599, + "loss/crossentropy": 2.163053512573242, + "loss/hidden": 3.53125, + "loss/jsd": 0.0, + "loss/logits": 0.3028685748577118, + "step": 1006 + }, + { + "epoch": 0.063, + "grad_norm": 3.75, + "grad_norm_var": 0.21956278483072916, + "learning_rate": 0.0001, + "loss": 9.0966, + "loss/crossentropy": 2.3403860330581665, + "loss/hidden": 3.6171875, + "loss/jsd": 0.0, + "loss/logits": 0.31008927524089813, + "step": 1008 + }, + { + "epoch": 0.063125, + "grad_norm": 4.25, + "grad_norm_var": 0.052098592122395836, + "learning_rate": 0.0001, + "loss": 9.3055, + "loss/crossentropy": 2.5604758262634277, + "loss/hidden": 3.6484375, + "loss/jsd": 0.0, + "loss/logits": 0.3352798819541931, + "step": 1010 + }, + { + "epoch": 0.06325, + "grad_norm": 4.0, + "grad_norm_var": 0.0339263916015625, + "learning_rate": 0.0001, + "loss": 9.2584, + "loss/crossentropy": 2.5583336353302, + "loss/hidden": 3.6796875, + "loss/jsd": 0.0, + "loss/logits": 0.323485866189003, + "step": 1012 + }, + { + "epoch": 0.063375, + "grad_norm": 3.53125, + "grad_norm_var": 0.042626953125, + "learning_rate": 0.0001, + "loss": 9.2338, + "loss/crossentropy": 2.4750031232833862, + "loss/hidden": 3.6796875, + "loss/jsd": 0.0, + "loss/logits": 0.32088278234004974, + "step": 1014 + }, + { + "epoch": 0.0635, + "grad_norm": 4.28125, + "grad_norm_var": 0.048005167643229166, + "learning_rate": 0.0001, + "loss": 9.4657, + "loss/crossentropy": 2.3103621006011963, + "loss/hidden": 3.703125, + "loss/jsd": 0.0, + "loss/logits": 0.3347797989845276, + "step": 1016 + }, + { + "epoch": 0.063625, + "grad_norm": 3.890625, + "grad_norm_var": 0.04117431640625, + "learning_rate": 0.0001, + "loss": 9.2284, + "loss/crossentropy": 2.281406044960022, + "loss/hidden": 3.703125, + "loss/jsd": 0.0, + "loss/logits": 0.33119070529937744, + "step": 1018 + }, + { + "epoch": 0.06375, + "grad_norm": 3.671875, + "grad_norm_var": 0.043822224934895834, + "learning_rate": 0.0001, + "loss": 9.1061, + "loss/crossentropy": 2.3903090953826904, + "loss/hidden": 3.640625, + "loss/jsd": 0.0, + "loss/logits": 0.30517514050006866, + "step": 1020 + }, + { + "epoch": 0.063875, + "grad_norm": 3.828125, + "grad_norm_var": 0.04397786458333333, + "learning_rate": 0.0001, + "loss": 9.3342, + "loss/crossentropy": 2.4621089696884155, + "loss/hidden": 3.671875, + "loss/jsd": 0.0, + "loss/logits": 0.3505241721868515, + "step": 1022 + }, + { + "epoch": 0.064, + "grad_norm": 3.734375, + "grad_norm_var": 0.051813761393229164, + "learning_rate": 0.0001, + "loss": 9.1248, + "loss/crossentropy": 2.3870365619659424, + "loss/hidden": 3.6171875, + "loss/jsd": 0.0, + "loss/logits": 0.3047706037759781, + "step": 1024 + }, + { + "epoch": 0.064125, + "grad_norm": 4.09375, + "grad_norm_var": 0.053564453125, + "learning_rate": 0.0001, + "loss": 9.1846, + "loss/crossentropy": 2.7421722412109375, + "loss/hidden": 3.7265625, + "loss/jsd": 0.0, + "loss/logits": 0.3265855461359024, + "step": 1026 + }, + { + "epoch": 0.06425, + "grad_norm": 3.703125, + "grad_norm_var": 0.0537261962890625, + "learning_rate": 0.0001, + "loss": 9.2907, + "loss/crossentropy": 2.296812057495117, + "loss/hidden": 3.6328125, + "loss/jsd": 0.0, + "loss/logits": 0.2955050766468048, + "step": 1028 + }, + { + "epoch": 0.064375, + "grad_norm": 4.09375, + "grad_norm_var": 0.0503570556640625, + "learning_rate": 0.0001, + "loss": 9.3552, + "loss/crossentropy": 2.6835397481918335, + "loss/hidden": 3.6484375, + "loss/jsd": 0.0, + "loss/logits": 0.33152663707733154, + "step": 1030 + }, + { + "epoch": 0.0645, + "grad_norm": 4.15625, + "grad_norm_var": 0.0430084228515625, + "learning_rate": 0.0001, + "loss": 9.3363, + "loss/crossentropy": 2.644715666770935, + "loss/hidden": 3.640625, + "loss/jsd": 0.0, + "loss/logits": 0.3512026369571686, + "step": 1032 + }, + { + "epoch": 0.064625, + "grad_norm": 4.15625, + "grad_norm_var": 0.0596343994140625, + "learning_rate": 0.0001, + "loss": 9.4068, + "loss/crossentropy": 2.424636483192444, + "loss/hidden": 3.640625, + "loss/jsd": 0.0, + "loss/logits": 0.3079882860183716, + "step": 1034 + }, + { + "epoch": 0.06475, + "grad_norm": 3.96875, + "grad_norm_var": 0.05446675618489583, + "learning_rate": 0.0001, + "loss": 9.4486, + "loss/crossentropy": 2.347719192504883, + "loss/hidden": 3.6796875, + "loss/jsd": 0.0, + "loss/logits": 0.3452465981245041, + "step": 1036 + }, + { + "epoch": 0.064875, + "grad_norm": 3.765625, + "grad_norm_var": 0.07506103515625, + "learning_rate": 0.0001, + "loss": 8.9964, + "loss/crossentropy": 2.0868254899978638, + "loss/hidden": 3.5234375, + "loss/jsd": 0.0, + "loss/logits": 0.3242860585451126, + "step": 1038 + }, + { + "epoch": 0.065, + "grad_norm": 5.40625, + "grad_norm_var": 0.21399637858072917, + "learning_rate": 0.0001, + "loss": 9.4657, + "loss/crossentropy": 2.4020437002182007, + "loss/hidden": 3.5859375, + "loss/jsd": 0.0, + "loss/logits": 0.3079642355442047, + "step": 1040 + }, + { + "epoch": 0.065125, + "grad_norm": 3.921875, + "grad_norm_var": 0.19934488932291666, + "learning_rate": 0.0001, + "loss": 9.4337, + "loss/crossentropy": 2.4271044731140137, + "loss/hidden": 3.703125, + "loss/jsd": 0.0, + "loss/logits": 0.3291940689086914, + "step": 1042 + }, + { + "epoch": 0.06525, + "grad_norm": 3.765625, + "grad_norm_var": 0.19329020182291667, + "learning_rate": 0.0001, + "loss": 9.196, + "loss/crossentropy": 2.3336949348449707, + "loss/hidden": 3.6015625, + "loss/jsd": 0.0, + "loss/logits": 0.3126966655254364, + "step": 1044 + }, + { + "epoch": 0.065375, + "grad_norm": 3.6875, + "grad_norm_var": 0.19695638020833334, + "learning_rate": 0.0001, + "loss": 9.4016, + "loss/crossentropy": 2.5471415519714355, + "loss/hidden": 3.6328125, + "loss/jsd": 0.0, + "loss/logits": 0.33399492502212524, + "step": 1046 + }, + { + "epoch": 0.0655, + "grad_norm": 3.984375, + "grad_norm_var": 0.19744364420572916, + "learning_rate": 0.0001, + "loss": 9.0525, + "loss/crossentropy": 2.028559982776642, + "loss/hidden": 3.59375, + "loss/jsd": 0.0, + "loss/logits": 0.3023695796728134, + "step": 1048 + }, + { + "epoch": 0.065625, + "grad_norm": 3.9375, + "grad_norm_var": 0.1896392822265625, + "learning_rate": 0.0001, + "loss": 9.2038, + "loss/crossentropy": 2.2506083250045776, + "loss/hidden": 3.6171875, + "loss/jsd": 0.0, + "loss/logits": 0.3274885416030884, + "step": 1050 + }, + { + "epoch": 0.06575, + "grad_norm": 4.0, + "grad_norm_var": 0.18593343098958334, + "learning_rate": 0.0001, + "loss": 9.3255, + "loss/crossentropy": 2.6331071853637695, + "loss/hidden": 3.5859375, + "loss/jsd": 0.0, + "loss/logits": 0.30817919969558716, + "step": 1052 + }, + { + "epoch": 0.065875, + "grad_norm": 3.875, + "grad_norm_var": 0.15797526041666668, + "learning_rate": 0.0001, + "loss": 9.2451, + "loss/crossentropy": 2.354863405227661, + "loss/hidden": 3.640625, + "loss/jsd": 0.0, + "loss/logits": 0.33920133113861084, + "step": 1054 + }, + { + "epoch": 0.066, + "grad_norm": 3.90625, + "grad_norm_var": 0.015315755208333334, + "learning_rate": 0.0001, + "loss": 9.3564, + "loss/crossentropy": 2.6202335357666016, + "loss/hidden": 3.546875, + "loss/jsd": 0.0, + "loss/logits": 0.3073268234729767, + "step": 1056 + }, + { + "epoch": 0.066125, + "grad_norm": 3.796875, + "grad_norm_var": 0.0164703369140625, + "learning_rate": 0.0001, + "loss": 9.1658, + "loss/crossentropy": 2.302557349205017, + "loss/hidden": 3.5546875, + "loss/jsd": 0.0, + "loss/logits": 0.31188826262950897, + "step": 1058 + }, + { + "epoch": 0.06625, + "grad_norm": 3.453125, + "grad_norm_var": 0.027293904622395834, + "learning_rate": 0.0001, + "loss": 9.131, + "loss/crossentropy": 2.514571189880371, + "loss/hidden": 3.6015625, + "loss/jsd": 0.0, + "loss/logits": 0.3184218853712082, + "step": 1060 + }, + { + "epoch": 0.066375, + "grad_norm": 3.859375, + "grad_norm_var": 0.025162760416666666, + "learning_rate": 0.0001, + "loss": 9.2056, + "loss/crossentropy": 2.264451503753662, + "loss/hidden": 3.6328125, + "loss/jsd": 0.0, + "loss/logits": 0.34064817428588867, + "step": 1062 + }, + { + "epoch": 0.0665, + "grad_norm": 3.9375, + "grad_norm_var": 0.018180338541666667, + "learning_rate": 0.0001, + "loss": 9.0521, + "loss/crossentropy": 2.342800498008728, + "loss/hidden": 3.625, + "loss/jsd": 0.0, + "loss/logits": 0.33910292387008667, + "step": 1064 + }, + { + "epoch": 0.066625, + "grad_norm": 3.90625, + "grad_norm_var": 0.021320597330729166, + "learning_rate": 0.0001, + "loss": 9.3295, + "loss/crossentropy": 2.5191909074783325, + "loss/hidden": 3.6484375, + "loss/jsd": 0.0, + "loss/logits": 0.34528471529483795, + "step": 1066 + }, + { + "epoch": 0.06675, + "grad_norm": 3.90625, + "grad_norm_var": 0.020930989583333334, + "learning_rate": 0.0001, + "loss": 9.2792, + "loss/crossentropy": 2.6589291095733643, + "loss/hidden": 3.6640625, + "loss/jsd": 0.0, + "loss/logits": 0.3547402173280716, + "step": 1068 + }, + { + "epoch": 0.066875, + "grad_norm": 3.59375, + "grad_norm_var": 0.023167928059895832, + "learning_rate": 0.0001, + "loss": 9.0386, + "loss/crossentropy": 2.1663339138031006, + "loss/hidden": 3.6171875, + "loss/jsd": 0.0, + "loss/logits": 0.3398684561252594, + "step": 1070 + }, + { + "epoch": 0.067, + "grad_norm": 4.46875, + "grad_norm_var": 0.06620992024739583, + "learning_rate": 0.0001, + "loss": 9.538, + "loss/crossentropy": 2.518619418144226, + "loss/hidden": 3.6171875, + "loss/jsd": 0.0, + "loss/logits": 0.33177968859672546, + "step": 1072 + }, + { + "epoch": 0.067125, + "grad_norm": 3.9375, + "grad_norm_var": 0.06520894368489584, + "learning_rate": 0.0001, + "loss": 9.4663, + "loss/crossentropy": 2.564071536064148, + "loss/hidden": 3.65625, + "loss/jsd": 0.0, + "loss/logits": 0.3523600995540619, + "step": 1074 + }, + { + "epoch": 0.06725, + "grad_norm": 3.65625, + "grad_norm_var": 0.056868489583333334, + "learning_rate": 0.0001, + "loss": 9.1435, + "loss/crossentropy": 2.314103364944458, + "loss/hidden": 3.578125, + "loss/jsd": 0.0, + "loss/logits": 0.3031466454267502, + "step": 1076 + }, + { + "epoch": 0.067375, + "grad_norm": 3.96875, + "grad_norm_var": 0.0743072509765625, + "learning_rate": 0.0001, + "loss": 9.0273, + "loss/crossentropy": 2.397694706916809, + "loss/hidden": 3.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.293083518743515, + "step": 1078 + }, + { + "epoch": 0.0675, + "grad_norm": 4.65625, + "grad_norm_var": 0.11435139973958333, + "learning_rate": 0.0001, + "loss": 9.6091, + "loss/crossentropy": 2.3738266229629517, + "loss/hidden": 3.5703125, + "loss/jsd": 0.0, + "loss/logits": 0.35242393612861633, + "step": 1080 + }, + { + "epoch": 0.067625, + "grad_norm": 3.953125, + "grad_norm_var": 0.1121490478515625, + "learning_rate": 0.0001, + "loss": 9.3609, + "loss/crossentropy": 2.5533446073532104, + "loss/hidden": 3.6015625, + "loss/jsd": 0.0, + "loss/logits": 0.3422684669494629, + "step": 1082 + }, + { + "epoch": 0.06775, + "grad_norm": 3.78125, + "grad_norm_var": 0.10871988932291667, + "learning_rate": 0.0001, + "loss": 9.3206, + "loss/crossentropy": 2.405779242515564, + "loss/hidden": 3.6171875, + "loss/jsd": 0.0, + "loss/logits": 0.310599148273468, + "step": 1084 + }, + { + "epoch": 0.067875, + "grad_norm": 3.953125, + "grad_norm_var": 0.10204976399739583, + "learning_rate": 0.0001, + "loss": 9.0713, + "loss/crossentropy": 2.2090498208999634, + "loss/hidden": 3.5546875, + "loss/jsd": 0.0, + "loss/logits": 0.3123241662979126, + "step": 1086 + }, + { + "epoch": 0.068, + "grad_norm": 3.75, + "grad_norm_var": 0.0724761962890625, + "learning_rate": 0.0001, + "loss": 9.0476, + "loss/crossentropy": 2.280885696411133, + "loss/hidden": 3.6171875, + "loss/jsd": 0.0, + "loss/logits": 0.2895790636539459, + "step": 1088 + }, + { + "epoch": 0.068125, + "grad_norm": 3.78125, + "grad_norm_var": 0.0732330322265625, + "learning_rate": 0.0001, + "loss": 9.2207, + "loss/crossentropy": 2.337521195411682, + "loss/hidden": 3.5703125, + "loss/jsd": 0.0, + "loss/logits": 0.3371659815311432, + "step": 1090 + }, + { + "epoch": 0.06825, + "grad_norm": 3.578125, + "grad_norm_var": 0.08389383951822917, + "learning_rate": 0.0001, + "loss": 8.9717, + "loss/crossentropy": 2.358444333076477, + "loss/hidden": 3.640625, + "loss/jsd": 0.0, + "loss/logits": 0.34108731150627136, + "step": 1092 + }, + { + "epoch": 0.068375, + "grad_norm": 4.09375, + "grad_norm_var": 0.07141520182291666, + "learning_rate": 0.0001, + "loss": 9.0686, + "loss/crossentropy": 2.2623904943466187, + "loss/hidden": 3.546875, + "loss/jsd": 0.0, + "loss/logits": 0.3080063462257385, + "step": 1094 + }, + { + "epoch": 0.0685, + "grad_norm": 4.03125, + "grad_norm_var": 0.029084269205729166, + "learning_rate": 0.0001, + "loss": 9.4291, + "loss/crossentropy": 2.2225699424743652, + "loss/hidden": 3.5546875, + "loss/jsd": 0.0, + "loss/logits": 0.28325292468070984, + "step": 1096 + }, + { + "epoch": 0.068625, + "grad_norm": 3.6875, + "grad_norm_var": 0.02789306640625, + "learning_rate": 0.0001, + "loss": 9.258, + "loss/crossentropy": 2.362979292869568, + "loss/hidden": 3.5859375, + "loss/jsd": 0.0, + "loss/logits": 0.3146945536136627, + "step": 1098 + }, + { + "epoch": 0.06875, + "grad_norm": 3.9375, + "grad_norm_var": 0.035008748372395836, + "learning_rate": 0.0001, + "loss": 9.0957, + "loss/crossentropy": 2.3709558248519897, + "loss/hidden": 3.625, + "loss/jsd": 0.0, + "loss/logits": 0.2997266799211502, + "step": 1100 + }, + { + "epoch": 0.068875, + "grad_norm": 3.421875, + "grad_norm_var": 0.049637858072916666, + "learning_rate": 0.0001, + "loss": 8.8482, + "loss/crossentropy": 2.1995412707328796, + "loss/hidden": 3.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.31232061982154846, + "step": 1102 + }, + { + "epoch": 0.069, + "grad_norm": 4.96875, + "grad_norm_var": 0.13362630208333334, + "learning_rate": 0.0001, + "loss": 9.3721, + "loss/crossentropy": 2.179778814315796, + "loss/hidden": 3.5703125, + "loss/jsd": 0.0, + "loss/logits": 0.29335537552833557, + "step": 1104 + }, + { + "epoch": 0.069125, + "grad_norm": 4.3125, + "grad_norm_var": 0.35261128743489584, + "learning_rate": 0.0001, + "loss": 9.3248, + "loss/crossentropy": 2.369896650314331, + "loss/hidden": 3.703125, + "loss/jsd": 0.0, + "loss/logits": 0.3417474627494812, + "step": 1106 + }, + { + "epoch": 0.06925, + "grad_norm": 4.21875, + "grad_norm_var": 0.32355143229166666, + "learning_rate": 0.0001, + "loss": 9.422, + "loss/crossentropy": 2.6357239484786987, + "loss/hidden": 3.640625, + "loss/jsd": 0.0, + "loss/logits": 0.33385856449604034, + "step": 1108 + }, + { + "epoch": 0.069375, + "grad_norm": 4.1875, + "grad_norm_var": 0.33131103515625, + "learning_rate": 0.0001, + "loss": 9.3892, + "loss/crossentropy": 2.6354408264160156, + "loss/hidden": 3.609375, + "loss/jsd": 0.0, + "loss/logits": 0.33535902202129364, + "step": 1110 + }, + { + "epoch": 0.0695, + "grad_norm": 3.859375, + "grad_norm_var": 0.3421712239583333, + "learning_rate": 0.0001, + "loss": 9.3288, + "loss/crossentropy": 2.2603734731674194, + "loss/hidden": 3.53125, + "loss/jsd": 0.0, + "loss/logits": 0.31547120213508606, + "step": 1112 + }, + { + "epoch": 0.069625, + "grad_norm": 3.625, + "grad_norm_var": 0.3455556233723958, + "learning_rate": 0.0001, + "loss": 9.0927, + "loss/crossentropy": 2.2856796979904175, + "loss/hidden": 3.6796875, + "loss/jsd": 0.0, + "loss/logits": 0.31341737508773804, + "step": 1114 + }, + { + "epoch": 0.06975, + "grad_norm": 3.765625, + "grad_norm_var": 0.3824208577473958, + "learning_rate": 0.0001, + "loss": 9.065, + "loss/crossentropy": 2.3744817972183228, + "loss/hidden": 3.53125, + "loss/jsd": 0.0, + "loss/logits": 0.3295013904571533, + "step": 1116 + }, + { + "epoch": 0.069875, + "grad_norm": 3.953125, + "grad_norm_var": 0.33152567545572914, + "learning_rate": 0.0001, + "loss": 9.286, + "loss/crossentropy": 2.4832775592803955, + "loss/hidden": 3.5859375, + "loss/jsd": 0.0, + "loss/logits": 0.3211631774902344, + "step": 1118 + }, + { + "epoch": 0.07, + "grad_norm": 3.640625, + "grad_norm_var": 0.2992177327473958, + "learning_rate": 0.0001, + "loss": 9.2446, + "loss/crossentropy": 2.6656835079193115, + "loss/hidden": 3.609375, + "loss/jsd": 0.0, + "loss/logits": 0.33588290214538574, + "step": 1120 + }, + { + "epoch": 0.070125, + "grad_norm": 4.125, + "grad_norm_var": 0.09185282389322917, + "learning_rate": 0.0001, + "loss": 9.1868, + "loss/crossentropy": 2.418588876724243, + "loss/hidden": 3.6484375, + "loss/jsd": 0.0, + "loss/logits": 0.36348558962345123, + "step": 1122 + }, + { + "epoch": 0.07025, + "grad_norm": 4.09375, + "grad_norm_var": 0.08717041015625, + "learning_rate": 0.0001, + "loss": 9.1917, + "loss/crossentropy": 2.309618353843689, + "loss/hidden": 3.5546875, + "loss/jsd": 0.0, + "loss/logits": 0.2983853369951248, + "step": 1124 + }, + { + "epoch": 0.070375, + "grad_norm": 4.03125, + "grad_norm_var": 0.0684478759765625, + "learning_rate": 0.0001, + "loss": 9.0934, + "loss/crossentropy": 2.466736316680908, + "loss/hidden": 3.53125, + "loss/jsd": 0.0, + "loss/logits": 0.31100137531757355, + "step": 1126 + }, + { + "epoch": 0.0705, + "grad_norm": 3.984375, + "grad_norm_var": 0.10739644368489583, + "learning_rate": 0.0001, + "loss": 9.3913, + "loss/crossentropy": 2.4813402891159058, + "loss/hidden": 3.515625, + "loss/jsd": 0.0, + "loss/logits": 0.32681819796562195, + "step": 1128 + }, + { + "epoch": 0.070625, + "grad_norm": 3.53125, + "grad_norm_var": 0.1125152587890625, + "learning_rate": 0.0001, + "loss": 9.2876, + "loss/crossentropy": 2.6551177501678467, + "loss/hidden": 3.6640625, + "loss/jsd": 0.0, + "loss/logits": 0.30686071515083313, + "step": 1130 + }, + { + "epoch": 0.07075, + "grad_norm": 3.859375, + "grad_norm_var": 0.10204671223958334, + "learning_rate": 0.0001, + "loss": 9.164, + "loss/crossentropy": 2.266343593597412, + "loss/hidden": 3.484375, + "loss/jsd": 0.0, + "loss/logits": 0.29779092967510223, + "step": 1132 + }, + { + "epoch": 0.070875, + "grad_norm": 4.3125, + "grad_norm_var": 0.11245829264322917, + "learning_rate": 0.0001, + "loss": 9.2955, + "loss/crossentropy": 2.3365002870559692, + "loss/hidden": 3.6171875, + "loss/jsd": 0.0, + "loss/logits": 0.3126705288887024, + "step": 1134 + }, + { + "epoch": 0.071, + "grad_norm": 3.90625, + "grad_norm_var": 0.1045318603515625, + "learning_rate": 0.0001, + "loss": 9.0178, + "loss/crossentropy": 2.469061851501465, + "loss/hidden": 3.609375, + "loss/jsd": 0.0, + "loss/logits": 0.32917141914367676, + "step": 1136 + }, + { + "epoch": 0.071125, + "grad_norm": 3.5, + "grad_norm_var": 0.12392171223958333, + "learning_rate": 0.0001, + "loss": 8.9858, + "loss/crossentropy": 2.5383065938949585, + "loss/hidden": 3.5859375, + "loss/jsd": 0.0, + "loss/logits": 0.31508152186870575, + "step": 1138 + }, + { + "epoch": 0.07125, + "grad_norm": 3.703125, + "grad_norm_var": 0.1242828369140625, + "learning_rate": 0.0001, + "loss": 9.3816, + "loss/crossentropy": 2.7282421588897705, + "loss/hidden": 3.5859375, + "loss/jsd": 0.0, + "loss/logits": 0.329195499420166, + "step": 1140 + }, + { + "epoch": 0.071375, + "grad_norm": 4.125, + "grad_norm_var": 0.1225982666015625, + "learning_rate": 0.0001, + "loss": 9.5541, + "loss/crossentropy": 2.4712259769439697, + "loss/hidden": 3.5703125, + "loss/jsd": 0.0, + "loss/logits": 0.3016209304332733, + "step": 1142 + }, + { + "epoch": 0.0715, + "grad_norm": 3.9375, + "grad_norm_var": 0.057291666666666664, + "learning_rate": 0.0001, + "loss": 9.1036, + "loss/crossentropy": 2.4540599584579468, + "loss/hidden": 3.5625, + "loss/jsd": 0.0, + "loss/logits": 0.31999677419662476, + "step": 1144 + }, + { + "epoch": 0.071625, + "grad_norm": 4.21875, + "grad_norm_var": 0.053465779622395834, + "learning_rate": 0.0001, + "loss": 9.1462, + "loss/crossentropy": 2.808298349380493, + "loss/hidden": 3.6328125, + "loss/jsd": 0.0, + "loss/logits": 0.3199266195297241, + "step": 1146 + }, + { + "epoch": 0.07175, + "grad_norm": 3.796875, + "grad_norm_var": 0.06948954264322917, + "learning_rate": 0.0001, + "loss": 9.3125, + "loss/crossentropy": 2.5488197803497314, + "loss/hidden": 3.59375, + "loss/jsd": 0.0, + "loss/logits": 0.34709227085113525, + "step": 1148 + }, + { + "epoch": 0.071875, + "grad_norm": 3.328125, + "grad_norm_var": 0.08771158854166666, + "learning_rate": 0.0001, + "loss": 8.9519, + "loss/crossentropy": 2.3145110607147217, + "loss/hidden": 3.5, + "loss/jsd": 0.0, + "loss/logits": 0.3027106672525406, + "step": 1150 + }, + { + "epoch": 0.072, + "grad_norm": 5.375, + "grad_norm_var": 0.23205973307291666, + "learning_rate": 0.0001, + "loss": 9.251, + "loss/crossentropy": 2.2240471839904785, + "loss/hidden": 3.640625, + "loss/jsd": 0.0, + "loss/logits": 0.336179718375206, + "step": 1152 + }, + { + "epoch": 0.072125, + "grad_norm": 3.90625, + "grad_norm_var": 0.21687723795572916, + "learning_rate": 0.0001, + "loss": 9.1314, + "loss/crossentropy": 2.105097532272339, + "loss/hidden": 3.6328125, + "loss/jsd": 0.0, + "loss/logits": 0.3255419135093689, + "step": 1154 + }, + { + "epoch": 0.07225, + "grad_norm": 4.03125, + "grad_norm_var": 0.21018778483072917, + "learning_rate": 0.0001, + "loss": 9.0524, + "loss/crossentropy": 2.4139981269836426, + "loss/hidden": 3.6328125, + "loss/jsd": 0.0, + "loss/logits": 0.3454781472682953, + "step": 1156 + }, + { + "epoch": 0.072375, + "grad_norm": 3.78125, + "grad_norm_var": 0.21177469889322917, + "learning_rate": 0.0001, + "loss": 9.1533, + "loss/crossentropy": 2.3245939016342163, + "loss/hidden": 3.46875, + "loss/jsd": 0.0, + "loss/logits": 0.28686030209064484, + "step": 1158 + }, + { + "epoch": 0.0725, + "grad_norm": 3.765625, + "grad_norm_var": 0.22030843098958333, + "learning_rate": 0.0001, + "loss": 9.2865, + "loss/crossentropy": 2.5649216175079346, + "loss/hidden": 3.625, + "loss/jsd": 0.0, + "loss/logits": 0.3485357314348221, + "step": 1160 + }, + { + "epoch": 0.072625, + "grad_norm": 3.859375, + "grad_norm_var": 0.21592508951822917, + "learning_rate": 0.0001, + "loss": 9.4127, + "loss/crossentropy": 2.467991352081299, + "loss/hidden": 3.6015625, + "loss/jsd": 0.0, + "loss/logits": 0.3160312622785568, + "step": 1162 + }, + { + "epoch": 0.07275, + "grad_norm": 3.796875, + "grad_norm_var": 0.20200907389322917, + "learning_rate": 0.0001, + "loss": 9.208, + "loss/crossentropy": 2.4666264057159424, + "loss/hidden": 3.5390625, + "loss/jsd": 0.0, + "loss/logits": 0.32405731081962585, + "step": 1164 + }, + { + "epoch": 0.072875, + "grad_norm": 3.453125, + "grad_norm_var": 0.17566630045572917, + "learning_rate": 0.0001, + "loss": 8.9344, + "loss/crossentropy": 2.497612714767456, + "loss/hidden": 3.5703125, + "loss/jsd": 0.0, + "loss/logits": 0.2962990552186966, + "step": 1166 + }, + { + "epoch": 0.073, + "grad_norm": 4.21875, + "grad_norm_var": 0.07573140462239583, + "learning_rate": 0.0001, + "loss": 9.2981, + "loss/crossentropy": 2.3695040941238403, + "loss/hidden": 3.625, + "loss/jsd": 0.0, + "loss/logits": 0.321966215968132, + "step": 1168 + }, + { + "epoch": 0.073125, + "grad_norm": 3.984375, + "grad_norm_var": 0.07278645833333333, + "learning_rate": 0.0001, + "loss": 9.348, + "loss/crossentropy": 2.399674415588379, + "loss/hidden": 3.5703125, + "loss/jsd": 0.0, + "loss/logits": 0.3187423199415207, + "step": 1170 + }, + { + "epoch": 0.07325, + "grad_norm": 4.09375, + "grad_norm_var": 0.07532552083333334, + "learning_rate": 0.0001, + "loss": 9.2577, + "loss/crossentropy": 2.395334005355835, + "loss/hidden": 3.6484375, + "loss/jsd": 0.0, + "loss/logits": 0.3417189121246338, + "step": 1172 + }, + { + "epoch": 0.073375, + "grad_norm": 3.5, + "grad_norm_var": 0.08810933430989583, + "learning_rate": 0.0001, + "loss": 8.7726, + "loss/crossentropy": 2.0574229955673218, + "loss/hidden": 3.46875, + "loss/jsd": 0.0, + "loss/logits": 0.30047091841697693, + "step": 1174 + }, + { + "epoch": 0.0735, + "grad_norm": 3.59375, + "grad_norm_var": 0.08876953125, + "learning_rate": 0.0001, + "loss": 9.1649, + "loss/crossentropy": 2.3426761627197266, + "loss/hidden": 3.578125, + "loss/jsd": 0.0, + "loss/logits": 0.31436459720134735, + "step": 1176 + }, + { + "epoch": 0.073625, + "grad_norm": 4.03125, + "grad_norm_var": 0.0902984619140625, + "learning_rate": 0.0001, + "loss": 9.3917, + "loss/crossentropy": 2.4197838306427, + "loss/hidden": 3.5390625, + "loss/jsd": 0.0, + "loss/logits": 0.30905722081661224, + "step": 1178 + }, + { + "epoch": 0.07375, + "grad_norm": 4.6875, + "grad_norm_var": 0.12220052083333334, + "learning_rate": 0.0001, + "loss": 9.1579, + "loss/crossentropy": 2.3440288305282593, + "loss/hidden": 3.5234375, + "loss/jsd": 0.0, + "loss/logits": 0.3073788434267044, + "step": 1180 + }, + { + "epoch": 0.073875, + "grad_norm": 4.0625, + "grad_norm_var": 0.10395406087239584, + "learning_rate": 0.0001, + "loss": 9.3372, + "loss/crossentropy": 2.3033924102783203, + "loss/hidden": 3.5390625, + "loss/jsd": 0.0, + "loss/logits": 0.34168997406959534, + "step": 1182 + }, + { + "epoch": 0.074, + "grad_norm": 4.25, + "grad_norm_var": 0.0841705322265625, + "learning_rate": 0.0001, + "loss": 9.4208, + "loss/crossentropy": 2.6207507848739624, + "loss/hidden": 3.65625, + "loss/jsd": 0.0, + "loss/logits": 0.34534966945648193, + "step": 1184 + }, + { + "epoch": 0.074125, + "grad_norm": 3.78125, + "grad_norm_var": 0.08531494140625, + "learning_rate": 0.0001, + "loss": 9.286, + "loss/crossentropy": 2.4476726055145264, + "loss/hidden": 3.5, + "loss/jsd": 0.0, + "loss/logits": 0.3093183934688568, + "step": 1186 + }, + { + "epoch": 0.07425, + "grad_norm": 5.28125, + "grad_norm_var": 0.19589742024739584, + "learning_rate": 0.0001, + "loss": 9.0637, + "loss/crossentropy": 2.2181872725486755, + "loss/hidden": 3.640625, + "loss/jsd": 0.0, + "loss/logits": 0.32876770198345184, + "step": 1188 + }, + { + "epoch": 0.074375, + "grad_norm": 8.75, + "grad_norm_var": 1.5139719645182292, + "learning_rate": 0.0001, + "loss": 9.1797, + "loss/crossentropy": 2.1833176612854004, + "loss/hidden": 3.8828125, + "loss/jsd": 0.0, + "loss/logits": 0.36219629645347595, + "step": 1190 + }, + { + "epoch": 0.0745, + "grad_norm": 3.578125, + "grad_norm_var": 1.5350494384765625, + "learning_rate": 0.0001, + "loss": 9.0629, + "loss/crossentropy": 2.405817151069641, + "loss/hidden": 3.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.31212493777275085, + "step": 1192 + }, + { + "epoch": 0.074625, + "grad_norm": 4.28125, + "grad_norm_var": 1.5252675374348958, + "learning_rate": 0.0001, + "loss": 9.0899, + "loss/crossentropy": 2.432392120361328, + "loss/hidden": 3.609375, + "loss/jsd": 0.0, + "loss/logits": 0.323747381567955, + "step": 1194 + }, + { + "epoch": 0.07475, + "grad_norm": 4.1875, + "grad_norm_var": 1.516307576497396, + "learning_rate": 0.0001, + "loss": 9.4802, + "loss/crossentropy": 2.60243022441864, + "loss/hidden": 3.609375, + "loss/jsd": 0.0, + "loss/logits": 0.3159172534942627, + "step": 1196 + }, + { + "epoch": 0.074875, + "grad_norm": 4.0625, + "grad_norm_var": 1.518024698893229, + "learning_rate": 0.0001, + "loss": 9.1813, + "loss/crossentropy": 2.2708317041397095, + "loss/hidden": 3.515625, + "loss/jsd": 0.0, + "loss/logits": 0.3217354714870453, + "step": 1198 + }, + { + "epoch": 0.075, + "grad_norm": 3.53125, + "grad_norm_var": 1.5860260009765625, + "learning_rate": 0.0001, + "loss": 8.8086, + "loss/crossentropy": 2.2433842420578003, + "loss/hidden": 3.53125, + "loss/jsd": 0.0, + "loss/logits": 0.2955727279186249, + "step": 1200 + }, + { + "epoch": 0.075125, + "grad_norm": 4.0625, + "grad_norm_var": 1.5884429931640625, + "learning_rate": 0.0001, + "loss": 8.9904, + "loss/crossentropy": 2.544836163520813, + "loss/hidden": 3.546875, + "loss/jsd": 0.0, + "loss/logits": 0.3101474642753601, + "step": 1202 + }, + { + "epoch": 0.07525, + "grad_norm": 3.984375, + "grad_norm_var": 1.5440388997395833, + "learning_rate": 0.0001, + "loss": 9.4, + "loss/crossentropy": 2.546027898788452, + "loss/hidden": 3.671875, + "loss/jsd": 0.0, + "loss/logits": 0.3451322615146637, + "step": 1204 + }, + { + "epoch": 0.075375, + "grad_norm": 3.78125, + "grad_norm_var": 0.1521484375, + "learning_rate": 0.0001, + "loss": 9.0253, + "loss/crossentropy": 2.4012279510498047, + "loss/hidden": 3.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.290866881608963, + "step": 1206 + }, + { + "epoch": 0.0755, + "grad_norm": 5.15625, + "grad_norm_var": 18.180106608072915, + "learning_rate": 0.0001, + "loss": 10.0656, + "loss/crossentropy": 2.4804184436798096, + "loss/hidden": 3.6171875, + "loss/jsd": 0.0, + "loss/logits": 0.31466802954673767, + "step": 1208 + }, + { + "epoch": 0.075625, + "grad_norm": 3.421875, + "grad_norm_var": 18.290657552083335, + "learning_rate": 0.0001, + "loss": 8.942, + "loss/crossentropy": 2.104141592979431, + "loss/hidden": 3.4296875, + "loss/jsd": 0.0, + "loss/logits": 0.3016415685415268, + "step": 1210 + }, + { + "epoch": 0.07575, + "grad_norm": 3.53125, + "grad_norm_var": 18.478189086914064, + "learning_rate": 0.0001, + "loss": 8.7778, + "loss/crossentropy": 2.216665744781494, + "loss/hidden": 3.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.30427753925323486, + "step": 1212 + }, + { + "epoch": 0.075875, + "grad_norm": 3.59375, + "grad_norm_var": 18.572997029622396, + "learning_rate": 0.0001, + "loss": 8.9408, + "loss/crossentropy": 2.410394072532654, + "loss/hidden": 3.59375, + "loss/jsd": 0.0, + "loss/logits": 0.3141447752714157, + "step": 1214 + }, + { + "epoch": 0.076, + "grad_norm": 3.71875, + "grad_norm_var": 18.539623006184897, + "learning_rate": 0.0001, + "loss": 9.0702, + "loss/crossentropy": 2.2947787642478943, + "loss/hidden": 3.546875, + "loss/jsd": 0.0, + "loss/logits": 0.3040497452020645, + "step": 1216 + }, + { + "epoch": 0.076125, + "grad_norm": 3.96875, + "grad_norm_var": 18.52271728515625, + "learning_rate": 0.0001, + "loss": 9.5222, + "loss/crossentropy": 2.658992886543274, + "loss/hidden": 3.6484375, + "loss/jsd": 0.0, + "loss/logits": 0.33450958132743835, + "step": 1218 + }, + { + "epoch": 0.07625, + "grad_norm": 3.59375, + "grad_norm_var": 18.640262858072916, + "learning_rate": 0.0001, + "loss": 9.1006, + "loss/crossentropy": 2.382628321647644, + "loss/hidden": 3.5546875, + "loss/jsd": 0.0, + "loss/logits": 0.3292655050754547, + "step": 1220 + }, + { + "epoch": 0.076375, + "grad_norm": 3.859375, + "grad_norm_var": 18.59713134765625, + "learning_rate": 0.0001, + "loss": 8.8435, + "loss/crossentropy": 1.980876863002777, + "loss/hidden": 3.453125, + "loss/jsd": 0.0, + "loss/logits": 0.2758803069591522, + "step": 1222 + }, + { + "epoch": 0.0765, + "grad_norm": 3.640625, + "grad_norm_var": 0.031038411458333335, + "learning_rate": 0.0001, + "loss": 8.8474, + "loss/crossentropy": 2.0616570711135864, + "loss/hidden": 3.5390625, + "loss/jsd": 0.0, + "loss/logits": 0.28200867772102356, + "step": 1224 + }, + { + "epoch": 0.076625, + "grad_norm": 3.734375, + "grad_norm_var": 0.021564737955729166, + "learning_rate": 0.0001, + "loss": 9.1792, + "loss/crossentropy": 2.3715614080429077, + "loss/hidden": 3.5234375, + "loss/jsd": 0.0, + "loss/logits": 0.31676220893859863, + "step": 1226 + }, + { + "epoch": 0.07675, + "grad_norm": 3.84375, + "grad_norm_var": 0.021971638997395834, + "learning_rate": 0.0001, + "loss": 9.0379, + "loss/crossentropy": 2.5357784032821655, + "loss/hidden": 3.6171875, + "loss/jsd": 0.0, + "loss/logits": 0.31404489278793335, + "step": 1228 + }, + { + "epoch": 0.076875, + "grad_norm": 3.59375, + "grad_norm_var": 0.022419230143229166, + "learning_rate": 0.0001, + "loss": 8.6486, + "loss/crossentropy": 2.225548505783081, + "loss/hidden": 3.4140625, + "loss/jsd": 0.0, + "loss/logits": 0.286647230386734, + "step": 1230 + }, + { + "epoch": 0.077, + "grad_norm": 3.578125, + "grad_norm_var": 0.025797526041666668, + "learning_rate": 0.0001, + "loss": 8.9957, + "loss/crossentropy": 2.3057247400283813, + "loss/hidden": 3.5625, + "loss/jsd": 0.0, + "loss/logits": 0.3172541856765747, + "step": 1232 + }, + { + "epoch": 0.077125, + "grad_norm": 3.578125, + "grad_norm_var": 0.023388671875, + "learning_rate": 0.0001, + "loss": 9.0791, + "loss/crossentropy": 2.290403127670288, + "loss/hidden": 3.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.31419865787029266, + "step": 1234 + }, + { + "epoch": 0.07725, + "grad_norm": 3.46875, + "grad_norm_var": 0.021305338541666666, + "learning_rate": 0.0001, + "loss": 8.9479, + "loss/crossentropy": 2.1275144815444946, + "loss/hidden": 3.5703125, + "loss/jsd": 0.0, + "loss/logits": 0.31394386291503906, + "step": 1236 + }, + { + "epoch": 0.077375, + "grad_norm": 4.0625, + "grad_norm_var": 0.05321858723958333, + "learning_rate": 0.0001, + "loss": 8.9015, + "loss/crossentropy": 2.2454686164855957, + "loss/hidden": 3.5, + "loss/jsd": 0.0, + "loss/logits": 0.2934340536594391, + "step": 1238 + }, + { + "epoch": 0.0775, + "grad_norm": 3.703125, + "grad_norm_var": 0.0609527587890625, + "learning_rate": 0.0001, + "loss": 8.856, + "loss/crossentropy": 2.2809821367263794, + "loss/hidden": 3.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.29852208495140076, + "step": 1240 + }, + { + "epoch": 0.077625, + "grad_norm": 3.640625, + "grad_norm_var": 0.062841796875, + "learning_rate": 0.0001, + "loss": 9.0546, + "loss/crossentropy": 2.2770636081695557, + "loss/hidden": 3.5, + "loss/jsd": 0.0, + "loss/logits": 0.31510084867477417, + "step": 1242 + }, + { + "epoch": 0.07775, + "grad_norm": 3.671875, + "grad_norm_var": 0.06161702473958333, + "learning_rate": 0.0001, + "loss": 9.0968, + "loss/crossentropy": 2.2599165439605713, + "loss/hidden": 3.5625, + "loss/jsd": 0.0, + "loss/logits": 0.3091094493865967, + "step": 1244 + }, + { + "epoch": 0.077875, + "grad_norm": 3.703125, + "grad_norm_var": 0.05953369140625, + "learning_rate": 0.0001, + "loss": 8.9473, + "loss/crossentropy": 2.0840908885002136, + "loss/hidden": 3.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.29790809750556946, + "step": 1246 + }, + { + "epoch": 0.078, + "grad_norm": 3.703125, + "grad_norm_var": 0.10155843098958334, + "learning_rate": 0.0001, + "loss": 9.3225, + "loss/crossentropy": 2.345468759536743, + "loss/hidden": 3.5859375, + "loss/jsd": 0.0, + "loss/logits": 0.34454044699668884, + "step": 1248 + }, + { + "epoch": 0.078125, + "grad_norm": 3.515625, + "grad_norm_var": 0.10816141764322916, + "learning_rate": 0.0001, + "loss": 9.1545, + "loss/crossentropy": 2.413212776184082, + "loss/hidden": 3.5, + "loss/jsd": 0.0, + "loss/logits": 0.31984058022499084, + "step": 1250 + }, + { + "epoch": 0.07825, + "grad_norm": 3.65625, + "grad_norm_var": 0.110205078125, + "learning_rate": 0.0001, + "loss": 8.9687, + "loss/crossentropy": 2.4369957447052, + "loss/hidden": 3.5390625, + "loss/jsd": 0.0, + "loss/logits": 0.3076692074537277, + "step": 1252 + }, + { + "epoch": 0.078375, + "grad_norm": 4.59375, + "grad_norm_var": 0.12951558430989582, + "learning_rate": 0.0001, + "loss": 8.9991, + "loss/crossentropy": 2.415233612060547, + "loss/hidden": 3.5859375, + "loss/jsd": 0.0, + "loss/logits": 0.29537807404994965, + "step": 1254 + }, + { + "epoch": 0.0785, + "grad_norm": 7.8125, + "grad_norm_var": 1.1356770833333334, + "learning_rate": 0.0001, + "loss": 9.5869, + "loss/crossentropy": 2.3058598041534424, + "loss/hidden": 3.578125, + "loss/jsd": 0.0, + "loss/logits": 0.3160950839519501, + "step": 1256 + }, + { + "epoch": 0.078625, + "grad_norm": 3.484375, + "grad_norm_var": 1.149267578125, + "learning_rate": 0.0001, + "loss": 9.2005, + "loss/crossentropy": 2.412594437599182, + "loss/hidden": 3.625, + "loss/jsd": 0.0, + "loss/logits": 0.31167350709438324, + "step": 1258 + }, + { + "epoch": 0.07875, + "grad_norm": 3.34375, + "grad_norm_var": 1.1874989827473958, + "learning_rate": 0.0001, + "loss": 8.9355, + "loss/crossentropy": 2.365793824195862, + "loss/hidden": 3.3984375, + "loss/jsd": 0.0, + "loss/logits": 0.30335795879364014, + "step": 1260 + }, + { + "epoch": 0.078875, + "grad_norm": 6.6875, + "grad_norm_var": 1.6226236979166666, + "learning_rate": 0.0001, + "loss": 9.1613, + "loss/crossentropy": 2.261883854866028, + "loss/hidden": 3.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.3325384855270386, + "step": 1262 + }, + { + "epoch": 0.079, + "grad_norm": 3.828125, + "grad_norm_var": 1.6119099934895833, + "learning_rate": 0.0001, + "loss": 8.9772, + "loss/crossentropy": 2.37065851688385, + "loss/hidden": 3.5, + "loss/jsd": 0.0, + "loss/logits": 0.3135601878166199, + "step": 1264 + }, + { + "epoch": 0.079125, + "grad_norm": 5.0625, + "grad_norm_var": 1.6016103108723958, + "learning_rate": 0.0001, + "loss": 8.8108, + "loss/crossentropy": 2.330946683883667, + "loss/hidden": 3.484375, + "loss/jsd": 0.0, + "loss/logits": 0.28366419672966003, + "step": 1266 + }, + { + "epoch": 0.07925, + "grad_norm": 4.125, + "grad_norm_var": 1.5486399332682292, + "learning_rate": 0.0001, + "loss": 9.0762, + "loss/crossentropy": 2.5277793407440186, + "loss/hidden": 3.6171875, + "loss/jsd": 0.0, + "loss/logits": 0.317441463470459, + "step": 1268 + }, + { + "epoch": 0.079375, + "grad_norm": 4.09375, + "grad_norm_var": 1.5559234619140625, + "learning_rate": 0.0001, + "loss": 8.8627, + "loss/crossentropy": 2.0369693636894226, + "loss/hidden": 3.53125, + "loss/jsd": 0.0, + "loss/logits": 0.2870900481939316, + "step": 1270 + }, + { + "epoch": 0.0795, + "grad_norm": 3.671875, + "grad_norm_var": 0.6730794270833333, + "learning_rate": 0.0001, + "loss": 9.2479, + "loss/crossentropy": 2.266517758369446, + "loss/hidden": 3.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.3010869026184082, + "step": 1272 + }, + { + "epoch": 0.079625, + "grad_norm": 4.84375, + "grad_norm_var": 0.8135162353515625, + "learning_rate": 0.0001, + "loss": 9.2829, + "loss/crossentropy": 2.4318493604660034, + "loss/hidden": 3.578125, + "loss/jsd": 0.0, + "loss/logits": 0.3534676879644394, + "step": 1274 + }, + { + "epoch": 0.07975, + "grad_norm": 3.75, + "grad_norm_var": 0.7549112955729167, + "learning_rate": 0.0001, + "loss": 9.2051, + "loss/crossentropy": 2.665824294090271, + "loss/hidden": 3.609375, + "loss/jsd": 0.0, + "loss/logits": 0.3084219694137573, + "step": 1276 + }, + { + "epoch": 0.079875, + "grad_norm": 4.09375, + "grad_norm_var": 0.3110015869140625, + "learning_rate": 0.0001, + "loss": 9.3341, + "loss/crossentropy": 2.585180401802063, + "loss/hidden": 3.65625, + "loss/jsd": 0.0, + "loss/logits": 0.3485944867134094, + "step": 1278 + }, + { + "epoch": 0.08, + "grad_norm": 3.890625, + "grad_norm_var": 0.30613606770833335, + "learning_rate": 0.0001, + "loss": 9.0663, + "loss/crossentropy": 2.235915422439575, + "loss/hidden": 3.5234375, + "loss/jsd": 0.0, + "loss/logits": 0.3006708025932312, + "step": 1280 + }, + { + "epoch": 0.080125, + "grad_norm": 3.5625, + "grad_norm_var": 0.28245442708333335, + "learning_rate": 0.0001, + "loss": 8.9244, + "loss/crossentropy": 2.196950912475586, + "loss/hidden": 3.53125, + "loss/jsd": 0.0, + "loss/logits": 0.30859434604644775, + "step": 1282 + }, + { + "epoch": 0.08025, + "grad_norm": 3.84375, + "grad_norm_var": 0.27616780598958335, + "learning_rate": 0.0001, + "loss": 8.9921, + "loss/crossentropy": 2.3487383127212524, + "loss/hidden": 3.578125, + "loss/jsd": 0.0, + "loss/logits": 0.31605543196201324, + "step": 1284 + }, + { + "epoch": 0.080375, + "grad_norm": 3.625, + "grad_norm_var": 0.27758687337239585, + "learning_rate": 0.0001, + "loss": 9.0989, + "loss/crossentropy": 2.2177536487579346, + "loss/hidden": 3.453125, + "loss/jsd": 0.0, + "loss/logits": 0.2946600914001465, + "step": 1286 + }, + { + "epoch": 0.0805, + "grad_norm": 3.515625, + "grad_norm_var": 0.28926493326822916, + "learning_rate": 0.0001, + "loss": 8.8901, + "loss/crossentropy": 2.1754029989242554, + "loss/hidden": 3.5546875, + "loss/jsd": 0.0, + "loss/logits": 0.3095686435699463, + "step": 1288 + }, + { + "epoch": 0.080625, + "grad_norm": 3.984375, + "grad_norm_var": 0.04488932291666667, + "learning_rate": 0.0001, + "loss": 9.0607, + "loss/crossentropy": 2.392609477043152, + "loss/hidden": 3.5234375, + "loss/jsd": 0.0, + "loss/logits": 0.2938263714313507, + "step": 1290 + }, + { + "epoch": 0.08075, + "grad_norm": 4.1875, + "grad_norm_var": 0.3003326416015625, + "learning_rate": 0.0001, + "loss": 9.5169, + "loss/crossentropy": 2.4202769994735718, + "loss/hidden": 3.609375, + "loss/jsd": 0.0, + "loss/logits": 0.3104136437177658, + "step": 1292 + }, + { + "epoch": 0.080875, + "grad_norm": 3.640625, + "grad_norm_var": 0.29915262858072916, + "learning_rate": 0.0001, + "loss": 8.8761, + "loss/crossentropy": 2.2731767892837524, + "loss/hidden": 3.4375, + "loss/jsd": 0.0, + "loss/logits": 0.30449719727039337, + "step": 1294 + }, + { + "epoch": 0.081, + "grad_norm": 3.59375, + "grad_norm_var": 0.30543619791666665, + "learning_rate": 0.0001, + "loss": 9.0399, + "loss/crossentropy": 2.470086932182312, + "loss/hidden": 3.484375, + "loss/jsd": 0.0, + "loss/logits": 0.28193579614162445, + "step": 1296 + }, + { + "epoch": 0.081125, + "grad_norm": 3.40625, + "grad_norm_var": 0.2979644775390625, + "learning_rate": 0.0001, + "loss": 9.0157, + "loss/crossentropy": 2.167941153049469, + "loss/hidden": 3.421875, + "loss/jsd": 0.0, + "loss/logits": 0.29203882813453674, + "step": 1298 + }, + { + "epoch": 0.08125, + "grad_norm": 3.734375, + "grad_norm_var": 0.2990193684895833, + "learning_rate": 0.0001, + "loss": 9.1285, + "loss/crossentropy": 2.2209893465042114, + "loss/hidden": 3.640625, + "loss/jsd": 0.0, + "loss/logits": 0.309230774641037, + "step": 1300 + }, + { + "epoch": 0.081375, + "grad_norm": 3.671875, + "grad_norm_var": 0.2967274983723958, + "learning_rate": 0.0001, + "loss": 9.0471, + "loss/crossentropy": 2.2622058391571045, + "loss/hidden": 3.6640625, + "loss/jsd": 0.0, + "loss/logits": 0.32858118414878845, + "step": 1302 + }, + { + "epoch": 0.0815, + "grad_norm": 3.984375, + "grad_norm_var": 0.2907786051432292, + "learning_rate": 0.0001, + "loss": 9.001, + "loss/crossentropy": 2.550824522972107, + "loss/hidden": 3.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.30813509225845337, + "step": 1304 + }, + { + "epoch": 0.081625, + "grad_norm": 3.9375, + "grad_norm_var": 0.28088785807291666, + "learning_rate": 0.0001, + "loss": 9.1413, + "loss/crossentropy": 2.234109878540039, + "loss/hidden": 3.65625, + "loss/jsd": 0.0, + "loss/logits": 0.3101739436388016, + "step": 1306 + }, + { + "epoch": 0.08175, + "grad_norm": 3.625, + "grad_norm_var": 0.0467681884765625, + "learning_rate": 0.0001, + "loss": 9.1138, + "loss/crossentropy": 2.6315842866897583, + "loss/hidden": 3.6484375, + "loss/jsd": 0.0, + "loss/logits": 0.3196646571159363, + "step": 1308 + }, + { + "epoch": 0.081875, + "grad_norm": 3.453125, + "grad_norm_var": 0.052586873372395836, + "learning_rate": 0.0001, + "loss": 8.8418, + "loss/crossentropy": 2.262266516685486, + "loss/hidden": 3.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.29676589369773865, + "step": 1310 + }, + { + "epoch": 0.082, + "grad_norm": 3.703125, + "grad_norm_var": 0.049702962239583336, + "learning_rate": 0.0001, + "loss": 9.0204, + "loss/crossentropy": 2.2959643602371216, + "loss/hidden": 3.46875, + "loss/jsd": 0.0, + "loss/logits": 0.3009609580039978, + "step": 1312 + }, + { + "epoch": 0.082125, + "grad_norm": 3.46875, + "grad_norm_var": 0.03916727701822917, + "learning_rate": 0.0001, + "loss": 9.0814, + "loss/crossentropy": 2.8231089115142822, + "loss/hidden": 3.6328125, + "loss/jsd": 0.0, + "loss/logits": 0.320039302110672, + "step": 1314 + }, + { + "epoch": 0.08225, + "grad_norm": 3.515625, + "grad_norm_var": 0.043843587239583336, + "learning_rate": 0.0001, + "loss": 9.0418, + "loss/crossentropy": 2.6723110675811768, + "loss/hidden": 3.515625, + "loss/jsd": 0.0, + "loss/logits": 0.3218715190887451, + "step": 1316 + }, + { + "epoch": 0.082375, + "grad_norm": 3.921875, + "grad_norm_var": 0.04127197265625, + "learning_rate": 0.0001, + "loss": 9.1254, + "loss/crossentropy": 2.817944288253784, + "loss/hidden": 3.53125, + "loss/jsd": 0.0, + "loss/logits": 0.3088568150997162, + "step": 1318 + }, + { + "epoch": 0.0825, + "grad_norm": 3.25, + "grad_norm_var": 0.0599029541015625, + "learning_rate": 0.0001, + "loss": 8.9, + "loss/crossentropy": 2.3980846405029297, + "loss/hidden": 3.609375, + "loss/jsd": 0.0, + "loss/logits": 0.2999623566865921, + "step": 1320 + }, + { + "epoch": 0.082625, + "grad_norm": 3.84375, + "grad_norm_var": 0.0474273681640625, + "learning_rate": 0.0001, + "loss": 9.3528, + "loss/crossentropy": 2.595438838005066, + "loss/hidden": 3.5859375, + "loss/jsd": 0.0, + "loss/logits": 0.3496478945016861, + "step": 1322 + }, + { + "epoch": 0.08275, + "grad_norm": 3.734375, + "grad_norm_var": 0.0486724853515625, + "learning_rate": 0.0001, + "loss": 8.8593, + "loss/crossentropy": 2.2254860401153564, + "loss/hidden": 3.546875, + "loss/jsd": 0.0, + "loss/logits": 0.2829415947198868, + "step": 1324 + }, + { + "epoch": 0.082875, + "grad_norm": 3.484375, + "grad_norm_var": 0.049723307291666664, + "learning_rate": 0.0001, + "loss": 8.9027, + "loss/crossentropy": 2.44465708732605, + "loss/hidden": 3.4375, + "loss/jsd": 0.0, + "loss/logits": 0.32694484293460846, + "step": 1326 + }, + { + "epoch": 0.083, + "grad_norm": 3.734375, + "grad_norm_var": 0.053544108072916666, + "learning_rate": 0.0001, + "loss": 9.1538, + "loss/crossentropy": 2.313889980316162, + "loss/hidden": 3.625, + "loss/jsd": 0.0, + "loss/logits": 0.33095018565654755, + "step": 1328 + }, + { + "epoch": 0.083125, + "grad_norm": 3.90625, + "grad_norm_var": 0.049332682291666666, + "learning_rate": 0.0001, + "loss": 9.1456, + "loss/crossentropy": 2.4496175050735474, + "loss/hidden": 3.453125, + "loss/jsd": 0.0, + "loss/logits": 0.30967962741851807, + "step": 1330 + }, + { + "epoch": 0.08325, + "grad_norm": 3.5625, + "grad_norm_var": 0.04842020670572917, + "learning_rate": 0.0001, + "loss": 9.0947, + "loss/crossentropy": 2.386078953742981, + "loss/hidden": 3.515625, + "loss/jsd": 0.0, + "loss/logits": 0.3234080672264099, + "step": 1332 + }, + { + "epoch": 0.083375, + "grad_norm": 3.515625, + "grad_norm_var": 0.04586181640625, + "learning_rate": 0.0001, + "loss": 8.7408, + "loss/crossentropy": 2.339785575866699, + "loss/hidden": 3.609375, + "loss/jsd": 0.0, + "loss/logits": 0.32101643085479736, + "step": 1334 + }, + { + "epoch": 0.0835, + "grad_norm": 4.03125, + "grad_norm_var": 0.029230753580729168, + "learning_rate": 0.0001, + "loss": 8.8596, + "loss/crossentropy": 2.6105300188064575, + "loss/hidden": 3.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.31350916624069214, + "step": 1336 + }, + { + "epoch": 0.083625, + "grad_norm": 3.75, + "grad_norm_var": 0.028864542643229168, + "learning_rate": 0.0001, + "loss": 8.63, + "loss/crossentropy": 2.0749881863594055, + "loss/hidden": 3.390625, + "loss/jsd": 0.0, + "loss/logits": 0.28848570585250854, + "step": 1338 + }, + { + "epoch": 0.08375, + "grad_norm": 3.59375, + "grad_norm_var": 0.027977498372395833, + "learning_rate": 0.0001, + "loss": 9.1618, + "loss/crossentropy": 2.243058681488037, + "loss/hidden": 3.5390625, + "loss/jsd": 0.0, + "loss/logits": 0.2843780219554901, + "step": 1340 + }, + { + "epoch": 0.083875, + "grad_norm": 3.3125, + "grad_norm_var": 0.042536417643229164, + "learning_rate": 0.0001, + "loss": 8.6702, + "loss/crossentropy": 2.2894665002822876, + "loss/hidden": 3.53125, + "loss/jsd": 0.0, + "loss/logits": 0.3040817379951477, + "step": 1342 + }, + { + "epoch": 0.084, + "grad_norm": 3.53125, + "grad_norm_var": 0.056151326497395834, + "learning_rate": 0.0001, + "loss": 9.1365, + "loss/crossentropy": 2.210882782936096, + "loss/hidden": 3.5234375, + "loss/jsd": 0.0, + "loss/logits": 0.32323381304740906, + "step": 1344 + }, + { + "epoch": 0.084125, + "grad_norm": 4.125, + "grad_norm_var": 0.08358968098958333, + "learning_rate": 0.0001, + "loss": 9.1776, + "loss/crossentropy": 2.342850089073181, + "loss/hidden": 3.5234375, + "loss/jsd": 0.0, + "loss/logits": 0.2916054427623749, + "step": 1346 + }, + { + "epoch": 0.08425, + "grad_norm": 3.671875, + "grad_norm_var": 0.08816731770833333, + "learning_rate": 0.0001, + "loss": 9.2657, + "loss/crossentropy": 2.44538152217865, + "loss/hidden": 3.578125, + "loss/jsd": 0.0, + "loss/logits": 0.3160920739173889, + "step": 1348 + }, + { + "epoch": 0.084375, + "grad_norm": 3.8125, + "grad_norm_var": 0.08382059733072916, + "learning_rate": 0.0001, + "loss": 9.0063, + "loss/crossentropy": 2.5489304065704346, + "loss/hidden": 3.46875, + "loss/jsd": 0.0, + "loss/logits": 0.3081457316875458, + "step": 1350 + }, + { + "epoch": 0.0845, + "grad_norm": 3.59375, + "grad_norm_var": 0.0790191650390625, + "learning_rate": 0.0001, + "loss": 8.9541, + "loss/crossentropy": 2.180498778820038, + "loss/hidden": 3.4375, + "loss/jsd": 0.0, + "loss/logits": 0.3018496334552765, + "step": 1352 + }, + { + "epoch": 0.084625, + "grad_norm": 3.515625, + "grad_norm_var": 0.080224609375, + "learning_rate": 0.0001, + "loss": 9.082, + "loss/crossentropy": 2.384360671043396, + "loss/hidden": 3.453125, + "loss/jsd": 0.0, + "loss/logits": 0.28553615510463715, + "step": 1354 + }, + { + "epoch": 0.08475, + "grad_norm": 3.6875, + "grad_norm_var": 0.0791656494140625, + "learning_rate": 0.0001, + "loss": 9.0263, + "loss/crossentropy": 2.6955255270004272, + "loss/hidden": 3.640625, + "loss/jsd": 0.0, + "loss/logits": 0.3320935517549515, + "step": 1356 + }, + { + "epoch": 0.084875, + "grad_norm": 3.375, + "grad_norm_var": 0.05998942057291667, + "learning_rate": 0.0001, + "loss": 8.5942, + "loss/crossentropy": 2.084562659263611, + "loss/hidden": 3.484375, + "loss/jsd": 0.0, + "loss/logits": 0.2838726341724396, + "step": 1358 + }, + { + "epoch": 0.085, + "grad_norm": 3.796875, + "grad_norm_var": 0.046647135416666666, + "learning_rate": 0.0001, + "loss": 8.8832, + "loss/crossentropy": 2.3465107679367065, + "loss/hidden": 3.59375, + "loss/jsd": 0.0, + "loss/logits": 0.3064923733472824, + "step": 1360 + }, + { + "epoch": 0.085125, + "grad_norm": 3.828125, + "grad_norm_var": 0.025153605143229167, + "learning_rate": 0.0001, + "loss": 9.089, + "loss/crossentropy": 2.3062328100204468, + "loss/hidden": 3.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.3037351071834564, + "step": 1362 + }, + { + "epoch": 0.08525, + "grad_norm": 3.96875, + "grad_norm_var": 0.031148274739583332, + "learning_rate": 0.0001, + "loss": 8.9159, + "loss/crossentropy": 2.109978973865509, + "loss/hidden": 3.5234375, + "loss/jsd": 0.0, + "loss/logits": 0.3013267517089844, + "step": 1364 + }, + { + "epoch": 0.085375, + "grad_norm": 3.59375, + "grad_norm_var": 0.05119527180989583, + "learning_rate": 0.0001, + "loss": 8.9955, + "loss/crossentropy": 2.1400793194770813, + "loss/hidden": 3.40625, + "loss/jsd": 0.0, + "loss/logits": 0.2959713786840439, + "step": 1366 + }, + { + "epoch": 0.0855, + "grad_norm": 3.609375, + "grad_norm_var": 0.051985677083333334, + "learning_rate": 0.0001, + "loss": 9.0828, + "loss/crossentropy": 2.583168387413025, + "loss/hidden": 3.5390625, + "loss/jsd": 0.0, + "loss/logits": 0.3330073952674866, + "step": 1368 + }, + { + "epoch": 0.085625, + "grad_norm": 3.859375, + "grad_norm_var": 0.07195638020833334, + "learning_rate": 0.0001, + "loss": 8.9546, + "loss/crossentropy": 2.3753483295440674, + "loss/hidden": 3.5546875, + "loss/jsd": 0.0, + "loss/logits": 0.2922537922859192, + "step": 1370 + }, + { + "epoch": 0.08575, + "grad_norm": 3.6875, + "grad_norm_var": 0.07174072265625, + "learning_rate": 0.0001, + "loss": 8.9679, + "loss/crossentropy": 2.4578174352645874, + "loss/hidden": 3.5625, + "loss/jsd": 0.0, + "loss/logits": 0.313604399561882, + "step": 1372 + }, + { + "epoch": 0.085875, + "grad_norm": 4.8125, + "grad_norm_var": 0.13062744140625, + "learning_rate": 0.0001, + "loss": 8.83, + "loss/crossentropy": 2.1847041845321655, + "loss/hidden": 3.3671875, + "loss/jsd": 0.0, + "loss/logits": 0.29400117695331573, + "step": 1374 + }, + { + "epoch": 0.086, + "grad_norm": 3.25, + "grad_norm_var": 0.1586822509765625, + "learning_rate": 0.0001, + "loss": 8.8043, + "loss/crossentropy": 2.2943379878997803, + "loss/hidden": 3.46875, + "loss/jsd": 0.0, + "loss/logits": 0.31389278173446655, + "step": 1376 + }, + { + "epoch": 0.086125, + "grad_norm": 3.65625, + "grad_norm_var": 0.16523030598958333, + "learning_rate": 0.0001, + "loss": 9.0582, + "loss/crossentropy": 2.594352960586548, + "loss/hidden": 3.6015625, + "loss/jsd": 0.0, + "loss/logits": 0.32820238173007965, + "step": 1378 + }, + { + "epoch": 0.08625, + "grad_norm": 3.671875, + "grad_norm_var": 0.15104878743489583, + "learning_rate": 0.0001, + "loss": 9.3228, + "loss/crossentropy": 2.571452260017395, + "loss/hidden": 3.6796875, + "loss/jsd": 0.0, + "loss/logits": 0.3398682475090027, + "step": 1380 + }, + { + "epoch": 0.086375, + "grad_norm": 3.5625, + "grad_norm_var": 0.13977457682291666, + "learning_rate": 0.0001, + "loss": 8.9274, + "loss/crossentropy": 2.431585431098938, + "loss/hidden": 3.40625, + "loss/jsd": 0.0, + "loss/logits": 0.2900923192501068, + "step": 1382 + }, + { + "epoch": 0.0865, + "grad_norm": 3.640625, + "grad_norm_var": 0.14218343098958333, + "learning_rate": 0.0001, + "loss": 8.9046, + "loss/crossentropy": 2.4001163244247437, + "loss/hidden": 3.515625, + "loss/jsd": 0.0, + "loss/logits": 0.3200056552886963, + "step": 1384 + }, + { + "epoch": 0.086625, + "grad_norm": 3.53125, + "grad_norm_var": 0.12639058430989583, + "learning_rate": 0.0001, + "loss": 8.7451, + "loss/crossentropy": 2.212107300758362, + "loss/hidden": 3.4296875, + "loss/jsd": 0.0, + "loss/logits": 0.29481518268585205, + "step": 1386 + }, + { + "epoch": 0.08675, + "grad_norm": 4.53125, + "grad_norm_var": 0.17360738118489583, + "learning_rate": 0.0001, + "loss": 8.9308, + "loss/crossentropy": 2.1325159072875977, + "loss/hidden": 3.5, + "loss/jsd": 0.0, + "loss/logits": 0.3161931484937668, + "step": 1388 + }, + { + "epoch": 0.086875, + "grad_norm": 3.84375, + "grad_norm_var": 0.08817952473958333, + "learning_rate": 0.0001, + "loss": 8.9893, + "loss/crossentropy": 2.7963234186172485, + "loss/hidden": 3.484375, + "loss/jsd": 0.0, + "loss/logits": 0.3157733827829361, + "step": 1390 + }, + { + "epoch": 0.087, + "grad_norm": 3.578125, + "grad_norm_var": 0.07779541015625, + "learning_rate": 0.0001, + "loss": 8.7993, + "loss/crossentropy": 2.404562830924988, + "loss/hidden": 3.5, + "loss/jsd": 0.0, + "loss/logits": 0.3078538328409195, + "step": 1392 + }, + { + "epoch": 0.087125, + "grad_norm": 3.390625, + "grad_norm_var": 0.0818359375, + "learning_rate": 0.0001, + "loss": 8.9106, + "loss/crossentropy": 2.411270022392273, + "loss/hidden": 3.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.3172430843114853, + "step": 1394 + }, + { + "epoch": 0.08725, + "grad_norm": 3.859375, + "grad_norm_var": 0.1228515625, + "learning_rate": 0.0001, + "loss": 9.1999, + "loss/crossentropy": 2.290405511856079, + "loss/hidden": 3.3125, + "loss/jsd": 0.0, + "loss/logits": 0.2864595949649811, + "step": 1396 + }, + { + "epoch": 0.087375, + "grad_norm": 3.5625, + "grad_norm_var": 0.12275288899739584, + "learning_rate": 0.0001, + "loss": 9.0627, + "loss/crossentropy": 2.3292382955551147, + "loss/hidden": 3.5234375, + "loss/jsd": 0.0, + "loss/logits": 0.28290052711963654, + "step": 1398 + }, + { + "epoch": 0.0875, + "grad_norm": 3.578125, + "grad_norm_var": 0.12724202473958332, + "learning_rate": 0.0001, + "loss": 8.6953, + "loss/crossentropy": 2.0091291666030884, + "loss/hidden": 3.375, + "loss/jsd": 0.0, + "loss/logits": 0.2793958783149719, + "step": 1400 + }, + { + "epoch": 0.087625, + "grad_norm": 4.21875, + "grad_norm_var": 0.13843994140625, + "learning_rate": 0.0001, + "loss": 8.9071, + "loss/crossentropy": 2.6832462549209595, + "loss/hidden": 3.6171875, + "loss/jsd": 0.0, + "loss/logits": 0.3183281272649765, + "step": 1402 + }, + { + "epoch": 0.08775, + "grad_norm": 3.3125, + "grad_norm_var": 0.10543212890625, + "learning_rate": 0.0001, + "loss": 8.9968, + "loss/crossentropy": 2.267482042312622, + "loss/hidden": 3.484375, + "loss/jsd": 0.0, + "loss/logits": 0.3442094475030899, + "step": 1404 + }, + { + "epoch": 0.087875, + "grad_norm": 3.46875, + "grad_norm_var": 0.10003255208333334, + "learning_rate": 0.0001, + "loss": 8.8766, + "loss/crossentropy": 2.3723970651626587, + "loss/hidden": 3.4375, + "loss/jsd": 0.0, + "loss/logits": 0.293814942240715, + "step": 1406 + }, + { + "epoch": 0.088, + "grad_norm": 3.75, + "grad_norm_var": 0.10246988932291666, + "learning_rate": 0.0001, + "loss": 8.8107, + "loss/crossentropy": 2.3082317113876343, + "loss/hidden": 3.5234375, + "loss/jsd": 0.0, + "loss/logits": 0.3221370577812195, + "step": 1408 + }, + { + "epoch": 0.088125, + "grad_norm": 3.765625, + "grad_norm_var": 0.09614969889322916, + "learning_rate": 0.0001, + "loss": 8.956, + "loss/crossentropy": 2.4758397340774536, + "loss/hidden": 3.5234375, + "loss/jsd": 0.0, + "loss/logits": 0.33573949337005615, + "step": 1410 + }, + { + "epoch": 0.08825, + "grad_norm": 3.578125, + "grad_norm_var": 0.05336812337239583, + "learning_rate": 0.0001, + "loss": 8.5994, + "loss/crossentropy": 2.408522605895996, + "loss/hidden": 3.4375, + "loss/jsd": 0.0, + "loss/logits": 0.29929833114147186, + "step": 1412 + }, + { + "epoch": 0.088375, + "grad_norm": 3.640625, + "grad_norm_var": 0.05576883951822917, + "learning_rate": 0.0001, + "loss": 9.2281, + "loss/crossentropy": 2.563318610191345, + "loss/hidden": 3.484375, + "loss/jsd": 0.0, + "loss/logits": 0.32796354591846466, + "step": 1414 + }, + { + "epoch": 0.0885, + "grad_norm": 3.4375, + "grad_norm_var": 0.05320536295572917, + "learning_rate": 0.0001, + "loss": 9.0656, + "loss/crossentropy": 2.5199949741363525, + "loss/hidden": 3.484375, + "loss/jsd": 0.0, + "loss/logits": 0.29131297767162323, + "step": 1416 + }, + { + "epoch": 0.088625, + "grad_norm": 3.796875, + "grad_norm_var": 0.031525675455729166, + "learning_rate": 0.0001, + "loss": 8.8611, + "loss/crossentropy": 2.536887049674988, + "loss/hidden": 3.5703125, + "loss/jsd": 0.0, + "loss/logits": 0.3120550215244293, + "step": 1418 + }, + { + "epoch": 0.08875, + "grad_norm": 3.4375, + "grad_norm_var": 0.0244293212890625, + "learning_rate": 0.0001, + "loss": 8.9123, + "loss/crossentropy": 2.215959906578064, + "loss/hidden": 3.484375, + "loss/jsd": 0.0, + "loss/logits": 0.2943772077560425, + "step": 1420 + }, + { + "epoch": 0.088875, + "grad_norm": 3.546875, + "grad_norm_var": 0.023173014322916668, + "learning_rate": 0.0001, + "loss": 8.9674, + "loss/crossentropy": 2.1649523973464966, + "loss/hidden": 3.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.3028850704431534, + "step": 1422 + }, + { + "epoch": 0.089, + "grad_norm": 3.328125, + "grad_norm_var": 0.07156575520833333, + "learning_rate": 0.0001, + "loss": 8.9571, + "loss/crossentropy": 2.379367709159851, + "loss/hidden": 3.5546875, + "loss/jsd": 0.0, + "loss/logits": 0.32595328986644745, + "step": 1424 + }, + { + "epoch": 0.089125, + "grad_norm": 3.796875, + "grad_norm_var": 0.07255859375, + "learning_rate": 0.0001, + "loss": 9.0234, + "loss/crossentropy": 2.441414713859558, + "loss/hidden": 3.546875, + "loss/jsd": 0.0, + "loss/logits": 0.3373495191335678, + "step": 1426 + }, + { + "epoch": 0.08925, + "grad_norm": 3.8125, + "grad_norm_var": 0.07247721354166667, + "learning_rate": 0.0001, + "loss": 9.2807, + "loss/crossentropy": 2.720638632774353, + "loss/hidden": 3.6015625, + "loss/jsd": 0.0, + "loss/logits": 0.3340994864702225, + "step": 1428 + }, + { + "epoch": 0.089375, + "grad_norm": 3.984375, + "grad_norm_var": 0.07625325520833333, + "learning_rate": 0.0001, + "loss": 8.9398, + "loss/crossentropy": 2.171198010444641, + "loss/hidden": 3.484375, + "loss/jsd": 0.0, + "loss/logits": 0.3092898577451706, + "step": 1430 + }, + { + "epoch": 0.0895, + "grad_norm": 3.59375, + "grad_norm_var": 0.07291259765625, + "learning_rate": 0.0001, + "loss": 9.0595, + "loss/crossentropy": 2.5796386003494263, + "loss/hidden": 3.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.33887575566768646, + "step": 1432 + }, + { + "epoch": 0.089625, + "grad_norm": 3.5, + "grad_norm_var": 0.07304280598958333, + "learning_rate": 0.0001, + "loss": 8.7837, + "loss/crossentropy": 2.264691114425659, + "loss/hidden": 3.34375, + "loss/jsd": 0.0, + "loss/logits": 0.2661140263080597, + "step": 1434 + }, + { + "epoch": 0.08975, + "grad_norm": 4.09375, + "grad_norm_var": 0.08142903645833334, + "learning_rate": 0.0001, + "loss": 8.9487, + "loss/crossentropy": 2.474991798400879, + "loss/hidden": 3.5859375, + "loss/jsd": 0.0, + "loss/logits": 0.308430016040802, + "step": 1436 + }, + { + "epoch": 0.089875, + "grad_norm": 3.8125, + "grad_norm_var": 0.07261962890625, + "learning_rate": 0.0001, + "loss": 8.9615, + "loss/crossentropy": 2.464845299720764, + "loss/hidden": 3.484375, + "loss/jsd": 0.0, + "loss/logits": 0.29659199714660645, + "step": 1438 + }, + { + "epoch": 0.09, + "grad_norm": 3.421875, + "grad_norm_var": 0.034989420572916666, + "learning_rate": 0.0001, + "loss": 8.7208, + "loss/crossentropy": 2.349491000175476, + "loss/hidden": 3.5234375, + "loss/jsd": 0.0, + "loss/logits": 0.31345631182193756, + "step": 1440 + }, + { + "epoch": 0.090125, + "grad_norm": 3.578125, + "grad_norm_var": 0.063623046875, + "learning_rate": 0.0001, + "loss": 9.1662, + "loss/crossentropy": 2.5956228971481323, + "loss/hidden": 3.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.30429892241954803, + "step": 1442 + }, + { + "epoch": 0.09025, + "grad_norm": 3.71875, + "grad_norm_var": 0.06392822265625, + "learning_rate": 0.0001, + "loss": 8.7371, + "loss/crossentropy": 2.3084046840667725, + "loss/hidden": 3.359375, + "loss/jsd": 0.0, + "loss/logits": 0.2794123440980911, + "step": 1444 + }, + { + "epoch": 0.090375, + "grad_norm": 4.21875, + "grad_norm_var": 0.08909098307291667, + "learning_rate": 0.0001, + "loss": 8.6431, + "loss/crossentropy": 2.0833881497383118, + "loss/hidden": 3.375, + "loss/jsd": 0.0, + "loss/logits": 0.2675230801105499, + "step": 1446 + }, + { + "epoch": 0.0905, + "grad_norm": 3.640625, + "grad_norm_var": 0.08801676432291666, + "learning_rate": 0.0001, + "loss": 9.0263, + "loss/crossentropy": 2.3375617265701294, + "loss/hidden": 3.546875, + "loss/jsd": 0.0, + "loss/logits": 0.3266754746437073, + "step": 1448 + }, + { + "epoch": 0.090625, + "grad_norm": 4.9375, + "grad_norm_var": 0.2819295247395833, + "learning_rate": 0.0001, + "loss": 9.0776, + "loss/crossentropy": 2.259764075279236, + "loss/hidden": 3.625, + "loss/jsd": 0.0, + "loss/logits": 0.29133765399456024, + "step": 1450 + }, + { + "epoch": 0.09075, + "grad_norm": 3.0625, + "grad_norm_var": 0.3140777587890625, + "learning_rate": 0.0001, + "loss": 8.755, + "loss/crossentropy": 2.4632983207702637, + "loss/hidden": 3.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.30049507319927216, + "step": 1452 + }, + { + "epoch": 0.090875, + "grad_norm": 3.640625, + "grad_norm_var": 0.3232086181640625, + "learning_rate": 0.0001, + "loss": 8.8504, + "loss/crossentropy": 2.3947253227233887, + "loss/hidden": 3.46875, + "loss/jsd": 0.0, + "loss/logits": 0.2989191859960556, + "step": 1454 + }, + { + "epoch": 0.091, + "grad_norm": 3.59375, + "grad_norm_var": 0.3135579427083333, + "learning_rate": 0.0001, + "loss": 8.7741, + "loss/crossentropy": 2.366453766822815, + "loss/hidden": 3.390625, + "loss/jsd": 0.0, + "loss/logits": 0.2914280444383621, + "step": 1456 + }, + { + "epoch": 0.091125, + "grad_norm": 3.28125, + "grad_norm_var": 0.32301432291666665, + "learning_rate": 0.0001, + "loss": 8.6575, + "loss/crossentropy": 2.1803172826766968, + "loss/hidden": 3.453125, + "loss/jsd": 0.0, + "loss/logits": 0.3163032829761505, + "step": 1458 + }, + { + "epoch": 0.09125, + "grad_norm": 3.796875, + "grad_norm_var": 0.32535400390625, + "learning_rate": 0.0001, + "loss": 9.0737, + "loss/crossentropy": 2.459627389907837, + "loss/hidden": 3.609375, + "loss/jsd": 0.0, + "loss/logits": 0.31055358052253723, + "step": 1460 + }, + { + "epoch": 0.091375, + "grad_norm": 3.5, + "grad_norm_var": 0.31604410807291666, + "learning_rate": 0.0001, + "loss": 8.8823, + "loss/crossentropy": 2.204772710800171, + "loss/hidden": 3.4375, + "loss/jsd": 0.0, + "loss/logits": 0.27741169929504395, + "step": 1462 + }, + { + "epoch": 0.0915, + "grad_norm": 3.453125, + "grad_norm_var": 0.33539937337239584, + "learning_rate": 0.0001, + "loss": 8.8218, + "loss/crossentropy": 2.3198187351226807, + "loss/hidden": 3.4375, + "loss/jsd": 0.0, + "loss/logits": 0.3104029595851898, + "step": 1464 + }, + { + "epoch": 0.091625, + "grad_norm": 3.515625, + "grad_norm_var": 0.08391520182291666, + "learning_rate": 0.0001, + "loss": 8.7448, + "loss/crossentropy": 2.183798313140869, + "loss/hidden": 3.6171875, + "loss/jsd": 0.0, + "loss/logits": 0.31394851207733154, + "step": 1466 + }, + { + "epoch": 0.09175, + "grad_norm": 3.6875, + "grad_norm_var": 0.07111714680989584, + "learning_rate": 0.0001, + "loss": 9.1398, + "loss/crossentropy": 2.5468273162841797, + "loss/hidden": 3.65625, + "loss/jsd": 0.0, + "loss/logits": 0.3174278736114502, + "step": 1468 + }, + { + "epoch": 0.091875, + "grad_norm": 3.421875, + "grad_norm_var": 0.07214253743489583, + "learning_rate": 0.0001, + "loss": 8.887, + "loss/crossentropy": 2.617543339729309, + "loss/hidden": 3.4296875, + "loss/jsd": 0.0, + "loss/logits": 0.28839461505413055, + "step": 1470 + }, + { + "epoch": 0.092, + "grad_norm": 3.640625, + "grad_norm_var": 0.05963134765625, + "learning_rate": 0.0001, + "loss": 8.6417, + "loss/crossentropy": 2.598427653312683, + "loss/hidden": 3.5, + "loss/jsd": 0.0, + "loss/logits": 0.32050472497940063, + "step": 1472 + }, + { + "epoch": 0.092125, + "grad_norm": 3.625, + "grad_norm_var": 0.047883097330729166, + "learning_rate": 0.0001, + "loss": 9.0224, + "loss/crossentropy": 2.545255661010742, + "loss/hidden": 3.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.30465172231197357, + "step": 1474 + }, + { + "epoch": 0.09225, + "grad_norm": 3.640625, + "grad_norm_var": 0.049267578125, + "learning_rate": 0.0001, + "loss": 8.8417, + "loss/crossentropy": 2.4060131311416626, + "loss/hidden": 3.375, + "loss/jsd": 0.0, + "loss/logits": 0.286019504070282, + "step": 1476 + }, + { + "epoch": 0.092375, + "grad_norm": 3.6875, + "grad_norm_var": 0.022591145833333333, + "learning_rate": 0.0001, + "loss": 8.7607, + "loss/crossentropy": 2.196686089038849, + "loss/hidden": 3.375, + "loss/jsd": 0.0, + "loss/logits": 0.2895347326993942, + "step": 1478 + }, + { + "epoch": 0.0925, + "grad_norm": 3.484375, + "grad_norm_var": 0.03193257649739583, + "learning_rate": 0.0001, + "loss": 8.9549, + "loss/crossentropy": 2.2161173820495605, + "loss/hidden": 3.5546875, + "loss/jsd": 0.0, + "loss/logits": 0.3376694321632385, + "step": 1480 + }, + { + "epoch": 0.092625, + "grad_norm": 3.8125, + "grad_norm_var": 0.031916300455729164, + "learning_rate": 0.0001, + "loss": 8.9692, + "loss/crossentropy": 2.4159319400787354, + "loss/hidden": 3.5, + "loss/jsd": 0.0, + "loss/logits": 0.33057647943496704, + "step": 1482 + }, + { + "epoch": 0.09275, + "grad_norm": 3.328125, + "grad_norm_var": 0.036844889322916664, + "learning_rate": 0.0001, + "loss": 8.7113, + "loss/crossentropy": 2.301741361618042, + "loss/hidden": 3.5234375, + "loss/jsd": 0.0, + "loss/logits": 0.31546755135059357, + "step": 1484 + }, + { + "epoch": 0.092875, + "grad_norm": 3.515625, + "grad_norm_var": 0.0347076416015625, + "learning_rate": 0.0001, + "loss": 8.9267, + "loss/crossentropy": 2.2651939392089844, + "loss/hidden": 3.4296875, + "loss/jsd": 0.0, + "loss/logits": 0.30455446243286133, + "step": 1486 + }, + { + "epoch": 0.093, + "grad_norm": 3.75, + "grad_norm_var": 0.0397613525390625, + "learning_rate": 0.0001, + "loss": 9.0545, + "loss/crossentropy": 2.5079206228256226, + "loss/hidden": 3.46875, + "loss/jsd": 0.0, + "loss/logits": 0.3329392969608307, + "step": 1488 + }, + { + "epoch": 0.093125, + "grad_norm": 3.625, + "grad_norm_var": 0.04644775390625, + "learning_rate": 0.0001, + "loss": 9.1031, + "loss/crossentropy": 2.3883864879608154, + "loss/hidden": 3.59375, + "loss/jsd": 0.0, + "loss/logits": 0.31360116600990295, + "step": 1490 + }, + { + "epoch": 0.09325, + "grad_norm": 3.96875, + "grad_norm_var": 0.04755859375, + "learning_rate": 0.0001, + "loss": 9.1668, + "loss/crossentropy": 2.251497983932495, + "loss/hidden": 3.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.32022619247436523, + "step": 1492 + }, + { + "epoch": 0.093375, + "grad_norm": 3.609375, + "grad_norm_var": 0.0482818603515625, + "learning_rate": 0.0001, + "loss": 8.7119, + "loss/crossentropy": 2.223703145980835, + "loss/hidden": 3.375, + "loss/jsd": 0.0, + "loss/logits": 0.2985079288482666, + "step": 1494 + }, + { + "epoch": 0.0935, + "grad_norm": 3.390625, + "grad_norm_var": 0.04791259765625, + "learning_rate": 0.0001, + "loss": 8.7767, + "loss/crossentropy": 2.3311924934387207, + "loss/hidden": 3.4453125, + "loss/jsd": 0.0, + "loss/logits": 0.29468269646167755, + "step": 1496 + }, + { + "epoch": 0.093625, + "grad_norm": 3.65625, + "grad_norm_var": 0.05221354166666667, + "learning_rate": 0.0001, + "loss": 8.7455, + "loss/crossentropy": 2.2993088960647583, + "loss/hidden": 3.3671875, + "loss/jsd": 0.0, + "loss/logits": 0.31247611343860626, + "step": 1498 + }, + { + "epoch": 0.09375, + "grad_norm": 3.34375, + "grad_norm_var": 0.053376261393229166, + "learning_rate": 0.0001, + "loss": 8.7313, + "loss/crossentropy": 2.2700235843658447, + "loss/hidden": 3.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.2984912842512131, + "step": 1500 + }, + { + "epoch": 0.093875, + "grad_norm": 4.84375, + "grad_norm_var": 0.15165608723958332, + "learning_rate": 0.0001, + "loss": 8.8359, + "loss/crossentropy": 2.365916609764099, + "loss/hidden": 3.4375, + "loss/jsd": 0.0, + "loss/logits": 0.28694379329681396, + "step": 1502 + }, + { + "epoch": 0.094, + "grad_norm": 4.09375, + "grad_norm_var": 0.15507710774739583, + "learning_rate": 0.0001, + "loss": 8.9381, + "loss/crossentropy": 2.357543706893921, + "loss/hidden": 3.5703125, + "loss/jsd": 0.0, + "loss/logits": 0.3162877708673477, + "step": 1504 + }, + { + "epoch": 0.094125, + "grad_norm": 3.421875, + "grad_norm_var": 0.16050516764322917, + "learning_rate": 0.0001, + "loss": 8.6509, + "loss/crossentropy": 2.235751748085022, + "loss/hidden": 3.3671875, + "loss/jsd": 0.0, + "loss/logits": 0.31035009026527405, + "step": 1506 + }, + { + "epoch": 0.09425, + "grad_norm": 3.84375, + "grad_norm_var": 0.15790913899739584, + "learning_rate": 0.0001, + "loss": 8.9785, + "loss/crossentropy": 2.421581745147705, + "loss/hidden": 3.5, + "loss/jsd": 0.0, + "loss/logits": 0.30988195538520813, + "step": 1508 + }, + { + "epoch": 0.094375, + "grad_norm": 3.625, + "grad_norm_var": 0.1599761962890625, + "learning_rate": 0.0001, + "loss": 8.9374, + "loss/crossentropy": 2.314875602722168, + "loss/hidden": 3.4140625, + "loss/jsd": 0.0, + "loss/logits": 0.2935473322868347, + "step": 1510 + }, + { + "epoch": 0.0945, + "grad_norm": 4.0, + "grad_norm_var": 0.14195556640625, + "learning_rate": 0.0001, + "loss": 9.2329, + "loss/crossentropy": 2.3859556913375854, + "loss/hidden": 3.5625, + "loss/jsd": 0.0, + "loss/logits": 0.34312979876995087, + "step": 1512 + }, + { + "epoch": 0.094625, + "grad_norm": 3.640625, + "grad_norm_var": 0.135791015625, + "learning_rate": 0.0001, + "loss": 8.8305, + "loss/crossentropy": 2.4759390354156494, + "loss/hidden": 3.40625, + "loss/jsd": 0.0, + "loss/logits": 0.30718210339546204, + "step": 1514 + }, + { + "epoch": 0.09475, + "grad_norm": 3.4375, + "grad_norm_var": 0.13124593098958334, + "learning_rate": 0.0001, + "loss": 8.6304, + "loss/crossentropy": 2.4228183031082153, + "loss/hidden": 3.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.2753960192203522, + "step": 1516 + }, + { + "epoch": 0.094875, + "grad_norm": 3.4375, + "grad_norm_var": 0.04909566243489583, + "learning_rate": 0.0001, + "loss": 8.9843, + "loss/crossentropy": 2.4210203886032104, + "loss/hidden": 3.359375, + "loss/jsd": 0.0, + "loss/logits": 0.30042560398578644, + "step": 1518 + }, + { + "epoch": 0.095, + "grad_norm": 4.03125, + "grad_norm_var": 0.04656575520833333, + "learning_rate": 0.0001, + "loss": 9.012, + "loss/crossentropy": 2.45758593082428, + "loss/hidden": 3.4140625, + "loss/jsd": 0.0, + "loss/logits": 0.29631373286247253, + "step": 1520 + }, + { + "epoch": 0.095125, + "grad_norm": 3.265625, + "grad_norm_var": 0.049576822916666666, + "learning_rate": 0.0001, + "loss": 8.5905, + "loss/crossentropy": 2.4333043098449707, + "loss/hidden": 3.4375, + "loss/jsd": 0.0, + "loss/logits": 0.2933865636587143, + "step": 1522 + }, + { + "epoch": 0.09525, + "grad_norm": 3.71875, + "grad_norm_var": 0.04767252604166667, + "learning_rate": 0.0001, + "loss": 9.0024, + "loss/crossentropy": 2.32417368888855, + "loss/hidden": 3.5234375, + "loss/jsd": 0.0, + "loss/logits": 0.3141182065010071, + "step": 1524 + }, + { + "epoch": 0.095375, + "grad_norm": 3.625, + "grad_norm_var": 0.0506988525390625, + "learning_rate": 0.0001, + "loss": 9.1668, + "loss/crossentropy": 2.473471999168396, + "loss/hidden": 3.5, + "loss/jsd": 0.0, + "loss/logits": 0.3085293173789978, + "step": 1526 + }, + { + "epoch": 0.0955, + "grad_norm": 3.625, + "grad_norm_var": 0.0439605712890625, + "learning_rate": 0.0001, + "loss": 8.8465, + "loss/crossentropy": 2.4285165071487427, + "loss/hidden": 3.4375, + "loss/jsd": 0.0, + "loss/logits": 0.2852857708930969, + "step": 1528 + }, + { + "epoch": 0.095625, + "grad_norm": 3.421875, + "grad_norm_var": 0.0430328369140625, + "learning_rate": 0.0001, + "loss": 8.6667, + "loss/crossentropy": 2.263743579387665, + "loss/hidden": 3.328125, + "loss/jsd": 0.0, + "loss/logits": 0.2716449797153473, + "step": 1530 + }, + { + "epoch": 0.09575, + "grad_norm": 3.421875, + "grad_norm_var": 0.045979817708333336, + "learning_rate": 0.0001, + "loss": 8.8735, + "loss/crossentropy": 2.331640362739563, + "loss/hidden": 3.3203125, + "loss/jsd": 0.0, + "loss/logits": 0.28873661160469055, + "step": 1532 + }, + { + "epoch": 0.095875, + "grad_norm": 3.40625, + "grad_norm_var": 0.04674072265625, + "learning_rate": 0.0001, + "loss": 8.8375, + "loss/crossentropy": 2.4132364988327026, + "loss/hidden": 3.3671875, + "loss/jsd": 0.0, + "loss/logits": 0.285625159740448, + "step": 1534 + }, + { + "epoch": 0.096, + "grad_norm": 3.375, + "grad_norm_var": 0.039728800455729164, + "learning_rate": 0.0001, + "loss": 8.7776, + "loss/crossentropy": 2.4001948833465576, + "loss/hidden": 3.390625, + "loss/jsd": 0.0, + "loss/logits": 0.31654833257198334, + "step": 1536 + }, + { + "epoch": 0.096125, + "grad_norm": 3.796875, + "grad_norm_var": 0.03581441243489583, + "learning_rate": 0.0001, + "loss": 8.8068, + "loss/crossentropy": 2.258412718772888, + "loss/hidden": 3.390625, + "loss/jsd": 0.0, + "loss/logits": 0.2980117201805115, + "step": 1538 + }, + { + "epoch": 0.09625, + "grad_norm": 3.5625, + "grad_norm_var": 0.032958984375, + "learning_rate": 0.0001, + "loss": 8.3933, + "loss/crossentropy": 2.3027628660202026, + "loss/hidden": 3.40625, + "loss/jsd": 0.0, + "loss/logits": 0.28253039717674255, + "step": 1540 + }, + { + "epoch": 0.096375, + "grad_norm": 3.453125, + "grad_norm_var": 0.024409993489583334, + "learning_rate": 0.0001, + "loss": 8.6451, + "loss/crossentropy": 2.3603252172470093, + "loss/hidden": 3.3984375, + "loss/jsd": 0.0, + "loss/logits": 0.2751441150903702, + "step": 1542 + }, + { + "epoch": 0.0965, + "grad_norm": 3.5, + "grad_norm_var": 0.029710896809895835, + "learning_rate": 0.0001, + "loss": 8.6759, + "loss/crossentropy": 2.5256091356277466, + "loss/hidden": 3.3359375, + "loss/jsd": 0.0, + "loss/logits": 0.26941487193107605, + "step": 1544 + }, + { + "epoch": 0.096625, + "grad_norm": 4.0, + "grad_norm_var": 0.04315999348958333, + "learning_rate": 0.0001, + "loss": 9.1672, + "loss/crossentropy": 2.4796286821365356, + "loss/hidden": 3.46875, + "loss/jsd": 0.0, + "loss/logits": 0.3184451460838318, + "step": 1546 + }, + { + "epoch": 0.09675, + "grad_norm": 3.5625, + "grad_norm_var": 0.04156901041666667, + "learning_rate": 0.0001, + "loss": 8.8819, + "loss/crossentropy": 2.4188071489334106, + "loss/hidden": 3.40625, + "loss/jsd": 0.0, + "loss/logits": 0.31058116257190704, + "step": 1548 + }, + { + "epoch": 0.096875, + "grad_norm": 3.3125, + "grad_norm_var": 0.045441691080729166, + "learning_rate": 0.0001, + "loss": 8.7315, + "loss/crossentropy": 2.318482995033264, + "loss/hidden": 3.421875, + "loss/jsd": 0.0, + "loss/logits": 0.2847931683063507, + "step": 1550 + }, + { + "epoch": 0.097, + "grad_norm": 3.578125, + "grad_norm_var": 0.05179036458333333, + "learning_rate": 0.0001, + "loss": 8.8762, + "loss/crossentropy": 2.24004590511322, + "loss/hidden": 3.40625, + "loss/jsd": 0.0, + "loss/logits": 0.28266097605228424, + "step": 1552 + }, + { + "epoch": 0.097125, + "grad_norm": 3.171875, + "grad_norm_var": 0.05720926920572917, + "learning_rate": 0.0001, + "loss": 8.6779, + "loss/crossentropy": 2.10389643907547, + "loss/hidden": 3.34375, + "loss/jsd": 0.0, + "loss/logits": 0.2705800235271454, + "step": 1554 + }, + { + "epoch": 0.09725, + "grad_norm": 3.671875, + "grad_norm_var": 0.06609598795572917, + "learning_rate": 0.0001, + "loss": 8.7698, + "loss/crossentropy": 2.2352887392044067, + "loss/hidden": 3.4375, + "loss/jsd": 0.0, + "loss/logits": 0.30223119258880615, + "step": 1556 + }, + { + "epoch": 0.097375, + "grad_norm": 3.953125, + "grad_norm_var": 0.09449869791666667, + "learning_rate": 0.0001, + "loss": 9.038, + "loss/crossentropy": 2.4962942600250244, + "loss/hidden": 3.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.3383851647377014, + "step": 1558 + }, + { + "epoch": 0.0975, + "grad_norm": 3.484375, + "grad_norm_var": 0.07860921223958334, + "learning_rate": 0.0001, + "loss": 8.6868, + "loss/crossentropy": 2.388336658477783, + "loss/hidden": 3.3671875, + "loss/jsd": 0.0, + "loss/logits": 0.2802818864583969, + "step": 1560 + }, + { + "epoch": 0.097625, + "grad_norm": 3.40625, + "grad_norm_var": 0.06916910807291667, + "learning_rate": 0.0001, + "loss": 8.8473, + "loss/crossentropy": 2.4475581645965576, + "loss/hidden": 3.4375, + "loss/jsd": 0.0, + "loss/logits": 0.2849755436182022, + "step": 1562 + }, + { + "epoch": 0.09775, + "grad_norm": 3.203125, + "grad_norm_var": 0.0787506103515625, + "learning_rate": 0.0001, + "loss": 8.5789, + "loss/crossentropy": 2.5317403078079224, + "loss/hidden": 3.46875, + "loss/jsd": 0.0, + "loss/logits": 0.29401715099811554, + "step": 1564 + }, + { + "epoch": 0.097875, + "grad_norm": 3.671875, + "grad_norm_var": 0.08131103515625, + "learning_rate": 0.0001, + "loss": 8.9867, + "loss/crossentropy": 2.547517776489258, + "loss/hidden": 3.5390625, + "loss/jsd": 0.0, + "loss/logits": 0.3130339980125427, + "step": 1566 + }, + { + "epoch": 0.098, + "grad_norm": 3.625, + "grad_norm_var": 0.07197265625, + "learning_rate": 0.0001, + "loss": 8.6146, + "loss/crossentropy": 2.1298526525497437, + "loss/hidden": 3.359375, + "loss/jsd": 0.0, + "loss/logits": 0.2899221330881119, + "step": 1568 + }, + { + "epoch": 0.098125, + "grad_norm": 3.4375, + "grad_norm_var": 0.06285400390625, + "learning_rate": 0.0001, + "loss": 8.9054, + "loss/crossentropy": 2.4564274549484253, + "loss/hidden": 3.375, + "loss/jsd": 0.0, + "loss/logits": 0.313205823302269, + "step": 1570 + }, + { + "epoch": 0.09825, + "grad_norm": 3.546875, + "grad_norm_var": 0.05227864583333333, + "learning_rate": 0.0001, + "loss": 9.0662, + "loss/crossentropy": 2.246076822280884, + "loss/hidden": 3.4140625, + "loss/jsd": 0.0, + "loss/logits": 0.2935841381549835, + "step": 1572 + }, + { + "epoch": 0.098375, + "grad_norm": 3.3125, + "grad_norm_var": 0.027961222330729167, + "learning_rate": 0.0001, + "loss": 8.9055, + "loss/crossentropy": 2.38775098323822, + "loss/hidden": 3.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.32380862534046173, + "step": 1574 + }, + { + "epoch": 0.0985, + "grad_norm": 3.859375, + "grad_norm_var": 0.035700480143229164, + "learning_rate": 0.0001, + "loss": 8.7802, + "loss/crossentropy": 2.3213651180267334, + "loss/hidden": 3.3515625, + "loss/jsd": 0.0, + "loss/logits": 0.2993357926607132, + "step": 1576 + }, + { + "epoch": 0.098625, + "grad_norm": 3.21875, + "grad_norm_var": 0.04185282389322917, + "learning_rate": 0.0001, + "loss": 8.5289, + "loss/crossentropy": 2.0965049266815186, + "loss/hidden": 3.3359375, + "loss/jsd": 0.0, + "loss/logits": 0.279249906539917, + "step": 1578 + }, + { + "epoch": 0.09875, + "grad_norm": 3.484375, + "grad_norm_var": 0.03489481608072917, + "learning_rate": 0.0001, + "loss": 8.6456, + "loss/crossentropy": 2.1175107955932617, + "loss/hidden": 3.328125, + "loss/jsd": 0.0, + "loss/logits": 0.273018017411232, + "step": 1580 + }, + { + "epoch": 0.098875, + "grad_norm": 3.328125, + "grad_norm_var": 0.032796223958333336, + "learning_rate": 0.0001, + "loss": 8.9137, + "loss/crossentropy": 2.547134518623352, + "loss/hidden": 3.46875, + "loss/jsd": 0.0, + "loss/logits": 0.3091147541999817, + "step": 1582 + }, + { + "epoch": 0.099, + "grad_norm": 3.421875, + "grad_norm_var": 0.033589680989583336, + "learning_rate": 0.0001, + "loss": 8.678, + "loss/crossentropy": 2.406341552734375, + "loss/hidden": 3.3515625, + "loss/jsd": 0.0, + "loss/logits": 0.2856515645980835, + "step": 1584 + }, + { + "epoch": 0.099125, + "grad_norm": 3.859375, + "grad_norm_var": 0.06271870930989583, + "learning_rate": 0.0001, + "loss": 8.8898, + "loss/crossentropy": 2.435731291770935, + "loss/hidden": 3.4453125, + "loss/jsd": 0.0, + "loss/logits": 0.29519233107566833, + "step": 1586 + }, + { + "epoch": 0.09925, + "grad_norm": 3.9375, + "grad_norm_var": 0.0727447509765625, + "learning_rate": 0.0001, + "loss": 8.6688, + "loss/crossentropy": 2.3164761066436768, + "loss/hidden": 3.390625, + "loss/jsd": 0.0, + "loss/logits": 0.3253418505191803, + "step": 1588 + }, + { + "epoch": 0.099375, + "grad_norm": 3.390625, + "grad_norm_var": 0.070654296875, + "learning_rate": 0.0001, + "loss": 8.6513, + "loss/crossentropy": 2.2719457149505615, + "loss/hidden": 3.46875, + "loss/jsd": 0.0, + "loss/logits": 0.30052798986434937, + "step": 1590 + }, + { + "epoch": 0.0995, + "grad_norm": 3.90625, + "grad_norm_var": 0.07297770182291667, + "learning_rate": 0.0001, + "loss": 9.1721, + "loss/crossentropy": 2.5084248781204224, + "loss/hidden": 3.5625, + "loss/jsd": 0.0, + "loss/logits": 0.33814041316509247, + "step": 1592 + }, + { + "epoch": 0.099625, + "grad_norm": 3.40625, + "grad_norm_var": 0.06483968098958333, + "learning_rate": 0.0001, + "loss": 8.9066, + "loss/crossentropy": 2.382421851158142, + "loss/hidden": 3.3359375, + "loss/jsd": 0.0, + "loss/logits": 0.3085876405239105, + "step": 1594 + }, + { + "epoch": 0.09975, + "grad_norm": 3.359375, + "grad_norm_var": 0.07033589680989584, + "learning_rate": 0.0001, + "loss": 8.7353, + "loss/crossentropy": 2.3208523988723755, + "loss/hidden": 3.359375, + "loss/jsd": 0.0, + "loss/logits": 0.3062067925930023, + "step": 1596 + }, + { + "epoch": 0.099875, + "grad_norm": 3.3125, + "grad_norm_var": 0.06741434733072917, + "learning_rate": 0.0001, + "loss": 8.7032, + "loss/crossentropy": 2.4263393878936768, + "loss/hidden": 3.515625, + "loss/jsd": 0.0, + "loss/logits": 0.31763342022895813, + "step": 1598 + }, + { + "epoch": 0.1, + "grad_norm": 3.21875, + "grad_norm_var": 0.07450764973958333, + "learning_rate": 0.0001, + "loss": 8.6868, + "loss/crossentropy": 2.247215509414673, + "loss/hidden": 3.375, + "loss/jsd": 0.0, + "loss/logits": 0.28044338524341583, + "step": 1600 + }, + { + "epoch": 0.100125, + "grad_norm": 3.640625, + "grad_norm_var": 0.04682515462239583, + "learning_rate": 0.0001, + "loss": 8.7077, + "loss/crossentropy": 2.3089388608932495, + "loss/hidden": 3.34375, + "loss/jsd": 0.0, + "loss/logits": 0.28411509096622467, + "step": 1602 + }, + { + "epoch": 0.10025, + "grad_norm": 3.609375, + "grad_norm_var": 0.046296183268229166, + "learning_rate": 0.0001, + "loss": 9.043, + "loss/crossentropy": 2.711973190307617, + "loss/hidden": 3.4375, + "loss/jsd": 0.0, + "loss/logits": 0.310742050409317, + "step": 1604 + }, + { + "epoch": 0.100375, + "grad_norm": 3.484375, + "grad_norm_var": 0.05012919108072917, + "learning_rate": 0.0001, + "loss": 8.7587, + "loss/crossentropy": 2.206353545188904, + "loss/hidden": 3.3359375, + "loss/jsd": 0.0, + "loss/logits": 0.30005398392677307, + "step": 1606 + }, + { + "epoch": 0.1005, + "grad_norm": 3.671875, + "grad_norm_var": 0.042740885416666666, + "learning_rate": 0.0001, + "loss": 8.7516, + "loss/crossentropy": 2.3755295276641846, + "loss/hidden": 3.359375, + "loss/jsd": 0.0, + "loss/logits": 0.28789104521274567, + "step": 1608 + }, + { + "epoch": 0.100625, + "grad_norm": 3.34375, + "grad_norm_var": 0.04962565104166667, + "learning_rate": 0.0001, + "loss": 8.9675, + "loss/crossentropy": 2.5117024183273315, + "loss/hidden": 3.40625, + "loss/jsd": 0.0, + "loss/logits": 0.2872663736343384, + "step": 1610 + }, + { + "epoch": 0.10075, + "grad_norm": 3.859375, + "grad_norm_var": 0.05524088541666667, + "learning_rate": 0.0001, + "loss": 8.8949, + "loss/crossentropy": 2.1095504760742188, + "loss/hidden": 3.3671875, + "loss/jsd": 0.0, + "loss/logits": 0.26681819558143616, + "step": 1612 + }, + { + "epoch": 0.100875, + "grad_norm": 3.71875, + "grad_norm_var": 0.05439351399739583, + "learning_rate": 0.0001, + "loss": 9.0907, + "loss/crossentropy": 2.536198377609253, + "loss/hidden": 3.3828125, + "loss/jsd": 0.0, + "loss/logits": 0.3001484125852585, + "step": 1614 + }, + { + "epoch": 0.101, + "grad_norm": 3.4375, + "grad_norm_var": 0.044331868489583336, + "learning_rate": 0.0001, + "loss": 8.6285, + "loss/crossentropy": 2.444958448410034, + "loss/hidden": 3.390625, + "loss/jsd": 0.0, + "loss/logits": 0.2898731231689453, + "step": 1616 + }, + { + "epoch": 0.101125, + "grad_norm": 3.5, + "grad_norm_var": 0.04670308430989583, + "learning_rate": 0.0001, + "loss": 8.5124, + "loss/crossentropy": 2.492483615875244, + "loss/hidden": 3.375, + "loss/jsd": 0.0, + "loss/logits": 0.29097048938274384, + "step": 1618 + }, + { + "epoch": 0.10125, + "grad_norm": 3.328125, + "grad_norm_var": 0.039159138997395836, + "learning_rate": 0.0001, + "loss": 8.5946, + "loss/crossentropy": 2.3776297569274902, + "loss/hidden": 3.3984375, + "loss/jsd": 0.0, + "loss/logits": 0.2919171154499054, + "step": 1620 + }, + { + "epoch": 0.101375, + "grad_norm": 3.625, + "grad_norm_var": 0.039704386393229166, + "learning_rate": 0.0001, + "loss": 8.8745, + "loss/crossentropy": 2.458945870399475, + "loss/hidden": 3.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.31617042422294617, + "step": 1622 + }, + { + "epoch": 0.1015, + "grad_norm": 4.5, + "grad_norm_var": 0.09348551432291667, + "learning_rate": 0.0001, + "loss": 8.7595, + "loss/crossentropy": 2.2380698919296265, + "loss/hidden": 3.4375, + "loss/jsd": 0.0, + "loss/logits": 0.32717761397361755, + "step": 1624 + }, + { + "epoch": 0.101625, + "grad_norm": 3.40625, + "grad_norm_var": 0.0921051025390625, + "learning_rate": 0.0001, + "loss": 8.8646, + "loss/crossentropy": 2.504348874092102, + "loss/hidden": 3.3828125, + "loss/jsd": 0.0, + "loss/logits": 0.29782192409038544, + "step": 1626 + }, + { + "epoch": 0.10175, + "grad_norm": 3.4375, + "grad_norm_var": 0.09677734375, + "learning_rate": 0.0001, + "loss": 8.5019, + "loss/crossentropy": 2.281771183013916, + "loss/hidden": 3.453125, + "loss/jsd": 0.0, + "loss/logits": 0.2809130549430847, + "step": 1628 + }, + { + "epoch": 0.101875, + "grad_norm": 3.671875, + "grad_norm_var": 0.09638671875, + "learning_rate": 0.0001, + "loss": 8.7456, + "loss/crossentropy": 2.131038784980774, + "loss/hidden": 3.3671875, + "loss/jsd": 0.0, + "loss/logits": 0.2949763238430023, + "step": 1630 + }, + { + "epoch": 0.102, + "grad_norm": 4.0625, + "grad_norm_var": 0.11483968098958333, + "learning_rate": 0.0001, + "loss": 8.5063, + "loss/crossentropy": 2.3742181062698364, + "loss/hidden": 3.46875, + "loss/jsd": 0.0, + "loss/logits": 0.30859068036079407, + "step": 1632 + }, + { + "epoch": 0.102125, + "grad_norm": 3.609375, + "grad_norm_var": 0.1124176025390625, + "learning_rate": 0.0001, + "loss": 8.5566, + "loss/crossentropy": 2.250689148902893, + "loss/hidden": 3.359375, + "loss/jsd": 0.0, + "loss/logits": 0.2848288118839264, + "step": 1634 + }, + { + "epoch": 0.10225, + "grad_norm": 3.46875, + "grad_norm_var": 0.10181884765625, + "learning_rate": 0.0001, + "loss": 8.7109, + "loss/crossentropy": 2.4584755897521973, + "loss/hidden": 3.4453125, + "loss/jsd": 0.0, + "loss/logits": 0.29475922882556915, + "step": 1636 + }, + { + "epoch": 0.102375, + "grad_norm": 3.65625, + "grad_norm_var": 0.10305074055989584, + "learning_rate": 0.0001, + "loss": 8.5859, + "loss/crossentropy": 2.243297576904297, + "loss/hidden": 3.3046875, + "loss/jsd": 0.0, + "loss/logits": 0.2687932550907135, + "step": 1638 + }, + { + "epoch": 0.1025, + "grad_norm": 3.28125, + "grad_norm_var": 0.05028889973958333, + "learning_rate": 0.0001, + "loss": 8.807, + "loss/crossentropy": 2.3591182231903076, + "loss/hidden": 3.3828125, + "loss/jsd": 0.0, + "loss/logits": 0.2878338694572449, + "step": 1640 + }, + { + "epoch": 0.102625, + "grad_norm": 3.40625, + "grad_norm_var": 0.0458892822265625, + "learning_rate": 0.0001, + "loss": 8.7064, + "loss/crossentropy": 2.53536856174469, + "loss/hidden": 3.3515625, + "loss/jsd": 0.0, + "loss/logits": 0.28173747658729553, + "step": 1642 + }, + { + "epoch": 0.10275, + "grad_norm": 3.65625, + "grad_norm_var": 0.043879191080729164, + "learning_rate": 0.0001, + "loss": 8.5829, + "loss/crossentropy": 2.5345053672790527, + "loss/hidden": 3.5390625, + "loss/jsd": 0.0, + "loss/logits": 0.30797943472862244, + "step": 1644 + }, + { + "epoch": 0.102875, + "grad_norm": 3.6875, + "grad_norm_var": 0.04512430826822917, + "learning_rate": 0.0001, + "loss": 8.5724, + "loss/crossentropy": 2.3095182180404663, + "loss/hidden": 3.359375, + "loss/jsd": 0.0, + "loss/logits": 0.2998042106628418, + "step": 1646 + }, + { + "epoch": 0.103, + "grad_norm": 3.484375, + "grad_norm_var": 0.0231842041015625, + "learning_rate": 0.0001, + "loss": 8.6461, + "loss/crossentropy": 2.2530897855758667, + "loss/hidden": 3.328125, + "loss/jsd": 0.0, + "loss/logits": 0.30125299096107483, + "step": 1648 + }, + { + "epoch": 0.103125, + "grad_norm": 3.53125, + "grad_norm_var": 0.021875, + "learning_rate": 0.0001, + "loss": 8.878, + "loss/crossentropy": 2.557194471359253, + "loss/hidden": 3.34375, + "loss/jsd": 0.0, + "loss/logits": 0.28575199842453003, + "step": 1650 + }, + { + "epoch": 0.10325, + "grad_norm": 3.359375, + "grad_norm_var": 0.0226226806640625, + "learning_rate": 0.0001, + "loss": 8.5783, + "loss/crossentropy": 2.4618316888809204, + "loss/hidden": 3.4296875, + "loss/jsd": 0.0, + "loss/logits": 0.305898517370224, + "step": 1652 + }, + { + "epoch": 0.103375, + "grad_norm": 3.65625, + "grad_norm_var": 0.02310791015625, + "learning_rate": 0.0001, + "loss": 8.5743, + "loss/crossentropy": 2.524722933769226, + "loss/hidden": 3.3828125, + "loss/jsd": 0.0, + "loss/logits": 0.29061686992645264, + "step": 1654 + }, + { + "epoch": 0.1035, + "grad_norm": 3.4375, + "grad_norm_var": 0.022142537434895835, + "learning_rate": 0.0001, + "loss": 8.4769, + "loss/crossentropy": 2.602474570274353, + "loss/hidden": 3.3984375, + "loss/jsd": 0.0, + "loss/logits": 0.2977643758058548, + "step": 1656 + }, + { + "epoch": 0.103625, + "grad_norm": 3.4375, + "grad_norm_var": 0.022086588541666667, + "learning_rate": 0.0001, + "loss": 8.7637, + "loss/crossentropy": 2.364400863647461, + "loss/hidden": 3.453125, + "loss/jsd": 0.0, + "loss/logits": 0.31225910782814026, + "step": 1658 + }, + { + "epoch": 0.10375, + "grad_norm": 3.828125, + "grad_norm_var": 0.02642822265625, + "learning_rate": 0.0001, + "loss": 8.6777, + "loss/crossentropy": 2.4924440383911133, + "loss/hidden": 3.3828125, + "loss/jsd": 0.0, + "loss/logits": 0.3066846579313278, + "step": 1660 + }, + { + "epoch": 0.103875, + "grad_norm": 3.59375, + "grad_norm_var": 0.02388916015625, + "learning_rate": 0.0001, + "loss": 8.4898, + "loss/crossentropy": 2.183597683906555, + "loss/hidden": 3.5, + "loss/jsd": 0.0, + "loss/logits": 0.2970152199268341, + "step": 1662 + }, + { + "epoch": 0.104, + "grad_norm": 3.34375, + "grad_norm_var": 0.02330322265625, + "learning_rate": 0.0001, + "loss": 8.6848, + "loss/crossentropy": 2.5557087659835815, + "loss/hidden": 3.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.29651692509651184, + "step": 1664 + }, + { + "epoch": 0.104125, + "grad_norm": 3.328125, + "grad_norm_var": 0.0264312744140625, + "learning_rate": 0.0001, + "loss": 8.6747, + "loss/crossentropy": 2.3465185165405273, + "loss/hidden": 3.484375, + "loss/jsd": 0.0, + "loss/logits": 0.30662697553634644, + "step": 1666 + }, + { + "epoch": 0.10425, + "grad_norm": 3.3125, + "grad_norm_var": 0.0306793212890625, + "learning_rate": 0.0001, + "loss": 8.6526, + "loss/crossentropy": 2.4796379804611206, + "loss/hidden": 3.34375, + "loss/jsd": 0.0, + "loss/logits": 0.3008027672767639, + "step": 1668 + }, + { + "epoch": 0.104375, + "grad_norm": 3.9375, + "grad_norm_var": 0.04387613932291667, + "learning_rate": 0.0001, + "loss": 8.9315, + "loss/crossentropy": 2.4333351850509644, + "loss/hidden": 3.5, + "loss/jsd": 0.0, + "loss/logits": 0.29662713408470154, + "step": 1670 + }, + { + "epoch": 0.1045, + "grad_norm": 3.3125, + "grad_norm_var": 0.08879801432291666, + "learning_rate": 0.0001, + "loss": 8.6483, + "loss/crossentropy": 2.0529088377952576, + "loss/hidden": 3.3984375, + "loss/jsd": 0.0, + "loss/logits": 0.283397376537323, + "step": 1672 + }, + { + "epoch": 0.104625, + "grad_norm": 3.359375, + "grad_norm_var": 0.089306640625, + "learning_rate": 0.0001, + "loss": 8.8812, + "loss/crossentropy": 2.5718494653701782, + "loss/hidden": 3.46875, + "loss/jsd": 0.0, + "loss/logits": 0.32283854484558105, + "step": 1674 + }, + { + "epoch": 0.10475, + "grad_norm": 3.515625, + "grad_norm_var": 0.07821858723958333, + "learning_rate": 0.0001, + "loss": 8.6264, + "loss/crossentropy": 2.301028847694397, + "loss/hidden": 3.40625, + "loss/jsd": 0.0, + "loss/logits": 0.3003203123807907, + "step": 1676 + }, + { + "epoch": 0.104875, + "grad_norm": 3.171875, + "grad_norm_var": 0.08322652180989583, + "learning_rate": 0.0001, + "loss": 8.5515, + "loss/crossentropy": 2.2602317333221436, + "loss/hidden": 3.3125, + "loss/jsd": 0.0, + "loss/logits": 0.26862217485904694, + "step": 1678 + }, + { + "epoch": 0.105, + "grad_norm": 3.5625, + "grad_norm_var": 0.08742574055989584, + "learning_rate": 0.0001, + "loss": 8.5586, + "loss/crossentropy": 2.6255671977996826, + "loss/hidden": 3.4140625, + "loss/jsd": 0.0, + "loss/logits": 0.3011835664510727, + "step": 1680 + }, + { + "epoch": 0.105125, + "grad_norm": 3.109375, + "grad_norm_var": 0.09278055826822916, + "learning_rate": 0.0001, + "loss": 8.6635, + "loss/crossentropy": 2.5335636138916016, + "loss/hidden": 3.34375, + "loss/jsd": 0.0, + "loss/logits": 0.28424978256225586, + "step": 1682 + }, + { + "epoch": 0.10525, + "grad_norm": 3.421875, + "grad_norm_var": 0.08586324055989583, + "learning_rate": 0.0001, + "loss": 8.6685, + "loss/crossentropy": 2.275408983230591, + "loss/hidden": 3.3828125, + "loss/jsd": 0.0, + "loss/logits": 0.2945568561553955, + "step": 1684 + }, + { + "epoch": 0.105375, + "grad_norm": 3.46875, + "grad_norm_var": 0.07172749837239584, + "learning_rate": 0.0001, + "loss": 8.6884, + "loss/crossentropy": 2.459862232208252, + "loss/hidden": 3.296875, + "loss/jsd": 0.0, + "loss/logits": 0.2846823185682297, + "step": 1686 + }, + { + "epoch": 0.1055, + "grad_norm": 3.984375, + "grad_norm_var": 0.046418253580729166, + "learning_rate": 0.0001, + "loss": 8.5572, + "loss/crossentropy": 2.2836594581604004, + "loss/hidden": 3.296875, + "loss/jsd": 0.0, + "loss/logits": 0.2825407385826111, + "step": 1688 + }, + { + "epoch": 0.105625, + "grad_norm": 3.28125, + "grad_norm_var": 0.04759012858072917, + "learning_rate": 0.0001, + "loss": 8.8016, + "loss/crossentropy": 2.0917385816574097, + "loss/hidden": 3.328125, + "loss/jsd": 0.0, + "loss/logits": 0.290659636259079, + "step": 1690 + }, + { + "epoch": 0.10575, + "grad_norm": 3.265625, + "grad_norm_var": 0.05226949055989583, + "learning_rate": 0.0001, + "loss": 8.6239, + "loss/crossentropy": 2.3286694288253784, + "loss/hidden": 3.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.29725848138332367, + "step": 1692 + }, + { + "epoch": 0.105875, + "grad_norm": 3.4375, + "grad_norm_var": 0.047093709309895836, + "learning_rate": 0.0001, + "loss": 8.6667, + "loss/crossentropy": 2.1655561327934265, + "loss/hidden": 3.328125, + "loss/jsd": 0.0, + "loss/logits": 0.2947119176387787, + "step": 1694 + }, + { + "epoch": 0.106, + "grad_norm": 3.578125, + "grad_norm_var": 0.04413655598958333, + "learning_rate": 0.0001, + "loss": 8.9215, + "loss/crossentropy": 2.4700855016708374, + "loss/hidden": 3.3828125, + "loss/jsd": 0.0, + "loss/logits": 0.2996203601360321, + "step": 1696 + }, + { + "epoch": 0.106125, + "grad_norm": 3.34375, + "grad_norm_var": 0.0376129150390625, + "learning_rate": 0.0001, + "loss": 8.6172, + "loss/crossentropy": 2.4698420763015747, + "loss/hidden": 3.34375, + "loss/jsd": 0.0, + "loss/logits": 0.2655749022960663, + "step": 1698 + }, + { + "epoch": 0.10625, + "grad_norm": 3.546875, + "grad_norm_var": 0.03808186848958333, + "learning_rate": 0.0001, + "loss": 8.5207, + "loss/crossentropy": 2.3951499462127686, + "loss/hidden": 3.3671875, + "loss/jsd": 0.0, + "loss/logits": 0.3072836995124817, + "step": 1700 + }, + { + "epoch": 0.106375, + "grad_norm": 3.515625, + "grad_norm_var": 0.038182576497395836, + "learning_rate": 0.0001, + "loss": 8.6023, + "loss/crossentropy": 2.334869146347046, + "loss/hidden": 3.40625, + "loss/jsd": 0.0, + "loss/logits": 0.2989247292280197, + "step": 1702 + }, + { + "epoch": 0.1065, + "grad_norm": 4.6875, + "grad_norm_var": 0.11238606770833333, + "learning_rate": 0.0001, + "loss": 8.6929, + "loss/crossentropy": 2.1444605588912964, + "loss/hidden": 3.40625, + "loss/jsd": 0.0, + "loss/logits": 0.31034591794013977, + "step": 1704 + }, + { + "epoch": 0.106625, + "grad_norm": 3.359375, + "grad_norm_var": 0.11077473958333334, + "learning_rate": 0.0001, + "loss": 8.6212, + "loss/crossentropy": 2.298153877258301, + "loss/hidden": 3.3203125, + "loss/jsd": 0.0, + "loss/logits": 0.27205270528793335, + "step": 1706 + }, + { + "epoch": 0.10675, + "grad_norm": 3.65625, + "grad_norm_var": 0.10321858723958334, + "learning_rate": 0.0001, + "loss": 8.6963, + "loss/crossentropy": 2.4254097938537598, + "loss/hidden": 3.34375, + "loss/jsd": 0.0, + "loss/logits": 0.32095080614089966, + "step": 1708 + }, + { + "epoch": 0.106875, + "grad_norm": 3.5, + "grad_norm_var": 0.10187174479166666, + "learning_rate": 0.0001, + "loss": 8.4695, + "loss/crossentropy": 2.1711431741714478, + "loss/hidden": 3.28125, + "loss/jsd": 0.0, + "loss/logits": 0.2679228186607361, + "step": 1710 + }, + { + "epoch": 0.107, + "grad_norm": 3.3125, + "grad_norm_var": 0.10679423014322917, + "learning_rate": 0.0001, + "loss": 8.3566, + "loss/crossentropy": 2.1170949935913086, + "loss/hidden": 3.34375, + "loss/jsd": 0.0, + "loss/logits": 0.3108642250299454, + "step": 1712 + }, + { + "epoch": 0.107125, + "grad_norm": 3.109375, + "grad_norm_var": 0.11456705729166666, + "learning_rate": 0.0001, + "loss": 8.644, + "loss/crossentropy": 2.5449509620666504, + "loss/hidden": 3.375, + "loss/jsd": 0.0, + "loss/logits": 0.3044998347759247, + "step": 1714 + }, + { + "epoch": 0.10725, + "grad_norm": 4.0625, + "grad_norm_var": 0.13430582682291667, + "learning_rate": 0.0001, + "loss": 8.8835, + "loss/crossentropy": 2.4760228395462036, + "loss/hidden": 3.5234375, + "loss/jsd": 0.0, + "loss/logits": 0.32176442444324493, + "step": 1716 + }, + { + "epoch": 0.107375, + "grad_norm": 3.53125, + "grad_norm_var": 0.13414306640625, + "learning_rate": 0.0001, + "loss": 8.6377, + "loss/crossentropy": 2.3086230754852295, + "loss/hidden": 3.40625, + "loss/jsd": 0.0, + "loss/logits": 0.3137579560279846, + "step": 1718 + }, + { + "epoch": 0.1075, + "grad_norm": 3.59375, + "grad_norm_var": 0.04918212890625, + "learning_rate": 0.0001, + "loss": 8.9384, + "loss/crossentropy": 2.650801420211792, + "loss/hidden": 3.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.31000974774360657, + "step": 1720 + }, + { + "epoch": 0.107625, + "grad_norm": 3.765625, + "grad_norm_var": 0.054248046875, + "learning_rate": 0.0001, + "loss": 9.2105, + "loss/crossentropy": 2.595288038253784, + "loss/hidden": 3.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.3103038817644119, + "step": 1722 + }, + { + "epoch": 0.10775, + "grad_norm": 3.3125, + "grad_norm_var": 0.057038370768229166, + "learning_rate": 0.0001, + "loss": 8.8991, + "loss/crossentropy": 2.5326555967330933, + "loss/hidden": 3.328125, + "loss/jsd": 0.0, + "loss/logits": 0.3022647351026535, + "step": 1724 + }, + { + "epoch": 0.107875, + "grad_norm": 3.703125, + "grad_norm_var": 0.059403483072916666, + "learning_rate": 0.0001, + "loss": 8.8281, + "loss/crossentropy": 2.4913820028305054, + "loss/hidden": 3.3828125, + "loss/jsd": 0.0, + "loss/logits": 0.2860126197338104, + "step": 1726 + }, + { + "epoch": 0.108, + "grad_norm": 3.21875, + "grad_norm_var": 0.06424153645833333, + "learning_rate": 0.0001, + "loss": 8.7805, + "loss/crossentropy": 2.1372629404067993, + "loss/hidden": 3.3984375, + "loss/jsd": 0.0, + "loss/logits": 0.28189751505851746, + "step": 1728 + }, + { + "epoch": 0.108125, + "grad_norm": 3.625, + "grad_norm_var": 0.0627593994140625, + "learning_rate": 0.0001, + "loss": 8.8053, + "loss/crossentropy": 2.432206392288208, + "loss/hidden": 3.5546875, + "loss/jsd": 0.0, + "loss/logits": 0.35032832622528076, + "step": 1730 + }, + { + "epoch": 0.10825, + "grad_norm": 3.5, + "grad_norm_var": 0.0415191650390625, + "learning_rate": 0.0001, + "loss": 8.6348, + "loss/crossentropy": 2.4481922388076782, + "loss/hidden": 3.3515625, + "loss/jsd": 0.0, + "loss/logits": 0.28543633222579956, + "step": 1732 + }, + { + "epoch": 0.108375, + "grad_norm": 3.375, + "grad_norm_var": 0.04326171875, + "learning_rate": 0.0001, + "loss": 8.691, + "loss/crossentropy": 2.38582444190979, + "loss/hidden": 3.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.3336361348628998, + "step": 1734 + }, + { + "epoch": 0.1085, + "grad_norm": 3.5, + "grad_norm_var": 0.05136311848958333, + "learning_rate": 0.0001, + "loss": 8.5918, + "loss/crossentropy": 2.606018543243408, + "loss/hidden": 3.3203125, + "loss/jsd": 0.0, + "loss/logits": 0.30366943776607513, + "step": 1736 + }, + { + "epoch": 0.108625, + "grad_norm": 3.59375, + "grad_norm_var": 0.03434956868489583, + "learning_rate": 0.0001, + "loss": 8.7135, + "loss/crossentropy": 2.3895175457000732, + "loss/hidden": 3.359375, + "loss/jsd": 0.0, + "loss/logits": 0.3109103590250015, + "step": 1738 + }, + { + "epoch": 0.10875, + "grad_norm": 3.203125, + "grad_norm_var": 0.037328084309895836, + "learning_rate": 0.0001, + "loss": 8.4959, + "loss/crossentropy": 2.0861340761184692, + "loss/hidden": 3.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.28380608558654785, + "step": 1740 + }, + { + "epoch": 0.108875, + "grad_norm": 3.484375, + "grad_norm_var": 0.03951416015625, + "learning_rate": 0.0001, + "loss": 8.7312, + "loss/crossentropy": 2.449671506881714, + "loss/hidden": 3.390625, + "loss/jsd": 0.0, + "loss/logits": 0.28429578244686127, + "step": 1742 + }, + { + "epoch": 0.109, + "grad_norm": 3.328125, + "grad_norm_var": 0.037385050455729166, + "learning_rate": 0.0001, + "loss": 8.4461, + "loss/crossentropy": 2.4647138118743896, + "loss/hidden": 3.3515625, + "loss/jsd": 0.0, + "loss/logits": 0.2784615755081177, + "step": 1744 + }, + { + "epoch": 0.109125, + "grad_norm": 3.3125, + "grad_norm_var": 0.033177693684895836, + "learning_rate": 0.0001, + "loss": 8.7449, + "loss/crossentropy": 2.3156272172927856, + "loss/hidden": 3.375, + "loss/jsd": 0.0, + "loss/logits": 0.2726980447769165, + "step": 1746 + }, + { + "epoch": 0.10925, + "grad_norm": 3.40625, + "grad_norm_var": 0.03411458333333333, + "learning_rate": 0.0001, + "loss": 8.613, + "loss/crossentropy": 1.9271941781044006, + "loss/hidden": 3.28125, + "loss/jsd": 0.0, + "loss/logits": 0.2630353271961212, + "step": 1748 + }, + { + "epoch": 0.109375, + "grad_norm": 3.28125, + "grad_norm_var": 0.03327534993489583, + "learning_rate": 0.0001, + "loss": 8.501, + "loss/crossentropy": 2.2124346494674683, + "loss/hidden": 3.390625, + "loss/jsd": 0.0, + "loss/logits": 0.29161450266838074, + "step": 1750 + }, + { + "epoch": 0.1095, + "grad_norm": 3.40625, + "grad_norm_var": 0.0279296875, + "learning_rate": 0.0001, + "loss": 8.8103, + "loss/crossentropy": 2.3019338846206665, + "loss/hidden": 3.4296875, + "loss/jsd": 0.0, + "loss/logits": 0.29614363610744476, + "step": 1752 + }, + { + "epoch": 0.109625, + "grad_norm": 3.875, + "grad_norm_var": 0.0394927978515625, + "learning_rate": 0.0001, + "loss": 8.6762, + "loss/crossentropy": 2.299665927886963, + "loss/hidden": 3.34375, + "loss/jsd": 0.0, + "loss/logits": 0.2929713726043701, + "step": 1754 + }, + { + "epoch": 0.10975, + "grad_norm": 3.9375, + "grad_norm_var": 0.05947163899739583, + "learning_rate": 0.0001, + "loss": 8.7112, + "loss/crossentropy": 2.4006038904190063, + "loss/hidden": 3.34375, + "loss/jsd": 0.0, + "loss/logits": 0.2871282994747162, + "step": 1756 + }, + { + "epoch": 0.109875, + "grad_norm": 3.5, + "grad_norm_var": 0.04921875, + "learning_rate": 0.0001, + "loss": 8.6092, + "loss/crossentropy": 2.2746243476867676, + "loss/hidden": 3.4453125, + "loss/jsd": 0.0, + "loss/logits": 0.3178953379392624, + "step": 1758 + }, + { + "epoch": 0.11, + "grad_norm": 3.21875, + "grad_norm_var": 0.05380859375, + "learning_rate": 0.0001, + "loss": 8.4347, + "loss/crossentropy": 2.152518630027771, + "loss/hidden": 3.3984375, + "loss/jsd": 0.0, + "loss/logits": 0.308669313788414, + "step": 1760 + }, + { + "epoch": 0.110125, + "grad_norm": 3.15625, + "grad_norm_var": 0.05754801432291667, + "learning_rate": 0.0001, + "loss": 8.515, + "loss/crossentropy": 2.313698410987854, + "loss/hidden": 3.34375, + "loss/jsd": 0.0, + "loss/logits": 0.312808558344841, + "step": 1762 + }, + { + "epoch": 0.11025, + "grad_norm": 3.515625, + "grad_norm_var": 0.05791015625, + "learning_rate": 0.0001, + "loss": 8.4938, + "loss/crossentropy": 2.2726303339004517, + "loss/hidden": 3.390625, + "loss/jsd": 0.0, + "loss/logits": 0.2869200110435486, + "step": 1764 + }, + { + "epoch": 0.110375, + "grad_norm": 3.671875, + "grad_norm_var": 0.05747782389322917, + "learning_rate": 0.0001, + "loss": 8.547, + "loss/crossentropy": 2.309714913368225, + "loss/hidden": 3.3046875, + "loss/jsd": 0.0, + "loss/logits": 0.2861059010028839, + "step": 1766 + }, + { + "epoch": 0.1105, + "grad_norm": 3.5, + "grad_norm_var": 0.056493123372395836, + "learning_rate": 0.0001, + "loss": 8.7686, + "loss/crossentropy": 2.429977536201477, + "loss/hidden": 3.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.30839361250400543, + "step": 1768 + }, + { + "epoch": 0.110625, + "grad_norm": 3.421875, + "grad_norm_var": 0.049437459309895834, + "learning_rate": 0.0001, + "loss": 8.486, + "loss/crossentropy": 2.1271519660949707, + "loss/hidden": 3.3359375, + "loss/jsd": 0.0, + "loss/logits": 0.26674604415893555, + "step": 1770 + }, + { + "epoch": 0.11075, + "grad_norm": 3.390625, + "grad_norm_var": 0.02265625, + "learning_rate": 0.0001, + "loss": 8.6498, + "loss/crossentropy": 2.4554080963134766, + "loss/hidden": 3.2734375, + "loss/jsd": 0.0, + "loss/logits": 0.2772922068834305, + "step": 1772 + }, + { + "epoch": 0.110875, + "grad_norm": 3.359375, + "grad_norm_var": 0.021906534830729168, + "learning_rate": 0.0001, + "loss": 8.4247, + "loss/crossentropy": 2.179062843322754, + "loss/hidden": 3.375, + "loss/jsd": 0.0, + "loss/logits": 0.2816423773765564, + "step": 1774 + }, + { + "epoch": 0.111, + "grad_norm": 3.4375, + "grad_norm_var": 0.0189117431640625, + "learning_rate": 0.0001, + "loss": 8.6942, + "loss/crossentropy": 2.2097833156585693, + "loss/hidden": 3.3515625, + "loss/jsd": 0.0, + "loss/logits": 0.30545778572559357, + "step": 1776 + }, + { + "epoch": 0.111125, + "grad_norm": 3.46875, + "grad_norm_var": 0.0134429931640625, + "learning_rate": 0.0001, + "loss": 8.5308, + "loss/crossentropy": 2.35193407535553, + "loss/hidden": 3.34375, + "loss/jsd": 0.0, + "loss/logits": 0.27761800587177277, + "step": 1778 + }, + { + "epoch": 0.11125, + "grad_norm": 4.0625, + "grad_norm_var": 0.0426666259765625, + "learning_rate": 0.0001, + "loss": 8.7591, + "loss/crossentropy": 2.8923500776290894, + "loss/hidden": 3.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.31674572825431824, + "step": 1780 + }, + { + "epoch": 0.111375, + "grad_norm": 3.390625, + "grad_norm_var": 0.04739176432291667, + "learning_rate": 0.0001, + "loss": 8.7339, + "loss/crossentropy": 2.4018853902816772, + "loss/hidden": 3.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.3184930384159088, + "step": 1782 + }, + { + "epoch": 0.1115, + "grad_norm": 3.109375, + "grad_norm_var": 0.0567047119140625, + "learning_rate": 0.0001, + "loss": 8.5982, + "loss/crossentropy": 2.486867070198059, + "loss/hidden": 3.328125, + "loss/jsd": 0.0, + "loss/logits": 0.28121723234653473, + "step": 1784 + }, + { + "epoch": 0.111625, + "grad_norm": 3.59375, + "grad_norm_var": 0.05657145182291667, + "learning_rate": 0.0001, + "loss": 8.9316, + "loss/crossentropy": 2.6467264890670776, + "loss/hidden": 3.484375, + "loss/jsd": 0.0, + "loss/logits": 0.33492469787597656, + "step": 1786 + }, + { + "epoch": 0.11175, + "grad_norm": 3.40625, + "grad_norm_var": 0.05624593098958333, + "learning_rate": 0.0001, + "loss": 8.7063, + "loss/crossentropy": 2.251604914665222, + "loss/hidden": 3.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.30116818845272064, + "step": 1788 + }, + { + "epoch": 0.111875, + "grad_norm": 3.5, + "grad_norm_var": 0.056966145833333336, + "learning_rate": 0.0001, + "loss": 8.4636, + "loss/crossentropy": 2.1624085903167725, + "loss/hidden": 3.328125, + "loss/jsd": 0.0, + "loss/logits": 0.2598511800169945, + "step": 1790 + }, + { + "epoch": 0.112, + "grad_norm": 3.59375, + "grad_norm_var": 0.056818644205729164, + "learning_rate": 0.0001, + "loss": 8.4571, + "loss/crossentropy": 2.0495232343673706, + "loss/hidden": 3.265625, + "loss/jsd": 0.0, + "loss/logits": 0.24547497928142548, + "step": 1792 + }, + { + "epoch": 0.112125, + "grad_norm": 3.484375, + "grad_norm_var": 0.059794108072916664, + "learning_rate": 0.0001, + "loss": 8.8322, + "loss/crossentropy": 2.664864659309387, + "loss/hidden": 3.3828125, + "loss/jsd": 0.0, + "loss/logits": 0.30460360646247864, + "step": 1794 + }, + { + "epoch": 0.11225, + "grad_norm": 3.515625, + "grad_norm_var": 0.0298492431640625, + "learning_rate": 0.0001, + "loss": 8.646, + "loss/crossentropy": 2.494004487991333, + "loss/hidden": 3.3125, + "loss/jsd": 0.0, + "loss/logits": 0.2763901799917221, + "step": 1796 + }, + { + "epoch": 0.112375, + "grad_norm": 3.140625, + "grad_norm_var": 0.032515462239583334, + "learning_rate": 0.0001, + "loss": 8.3741, + "loss/crossentropy": 2.1645578145980835, + "loss/hidden": 3.328125, + "loss/jsd": 0.0, + "loss/logits": 0.2648225873708725, + "step": 1798 + }, + { + "epoch": 0.1125, + "grad_norm": 3.453125, + "grad_norm_var": 0.024413045247395834, + "learning_rate": 0.0001, + "loss": 8.4112, + "loss/crossentropy": 2.09485399723053, + "loss/hidden": 3.34375, + "loss/jsd": 0.0, + "loss/logits": 0.2805396020412445, + "step": 1800 + }, + { + "epoch": 0.112625, + "grad_norm": 3.453125, + "grad_norm_var": 0.022313435872395832, + "learning_rate": 0.0001, + "loss": 8.6201, + "loss/crossentropy": 2.369633436203003, + "loss/hidden": 3.2890625, + "loss/jsd": 0.0, + "loss/logits": 0.2918958216905594, + "step": 1802 + }, + { + "epoch": 0.11275, + "grad_norm": 3.671875, + "grad_norm_var": 0.026656087239583334, + "learning_rate": 0.0001, + "loss": 8.4743, + "loss/crossentropy": 2.522893786430359, + "loss/hidden": 3.3984375, + "loss/jsd": 0.0, + "loss/logits": 0.2649436295032501, + "step": 1804 + }, + { + "epoch": 0.112875, + "grad_norm": 3.65625, + "grad_norm_var": 0.03322652180989583, + "learning_rate": 0.0001, + "loss": 8.6938, + "loss/crossentropy": 2.4980560541152954, + "loss/hidden": 3.359375, + "loss/jsd": 0.0, + "loss/logits": 0.3046301603317261, + "step": 1806 + }, + { + "epoch": 0.113, + "grad_norm": 3.765625, + "grad_norm_var": 0.07986551920572917, + "learning_rate": 0.0001, + "loss": 8.6725, + "loss/crossentropy": 2.320141911506653, + "loss/hidden": 3.6015625, + "loss/jsd": 0.0, + "loss/logits": 0.3137675076723099, + "step": 1808 + }, + { + "epoch": 0.113125, + "grad_norm": 3.71875, + "grad_norm_var": 0.08251546223958334, + "learning_rate": 0.0001, + "loss": 8.7972, + "loss/crossentropy": 2.2626017332077026, + "loss/hidden": 3.4140625, + "loss/jsd": 0.0, + "loss/logits": 0.2625807821750641, + "step": 1810 + }, + { + "epoch": 0.11325, + "grad_norm": 3.734375, + "grad_norm_var": 0.08018290201822917, + "learning_rate": 0.0001, + "loss": 8.6687, + "loss/crossentropy": 2.136113405227661, + "loss/hidden": 3.5, + "loss/jsd": 0.0, + "loss/logits": 0.30796822905540466, + "step": 1812 + }, + { + "epoch": 0.113375, + "grad_norm": 5.5625, + "grad_norm_var": 0.3128326416015625, + "learning_rate": 0.0001, + "loss": 9.0458, + "loss/crossentropy": 2.75032639503479, + "loss/hidden": 3.3359375, + "loss/jsd": 0.0, + "loss/logits": 0.31279677152633667, + "step": 1814 + }, + { + "epoch": 0.1135, + "grad_norm": 4.9375, + "grad_norm_var": 0.3737457275390625, + "learning_rate": 0.0001, + "loss": 9.1144, + "loss/crossentropy": 2.538609027862549, + "loss/hidden": 3.484375, + "loss/jsd": 0.0, + "loss/logits": 0.32450392842292786, + "step": 1816 + }, + { + "epoch": 0.113625, + "grad_norm": 5.0625, + "grad_norm_var": 0.44909566243489585, + "learning_rate": 0.0001, + "loss": 8.8125, + "loss/crossentropy": 2.322131633758545, + "loss/hidden": 3.53125, + "loss/jsd": 0.0, + "loss/logits": 0.3943719118833542, + "step": 1818 + }, + { + "epoch": 0.11375, + "grad_norm": 3.453125, + "grad_norm_var": 0.445068359375, + "learning_rate": 0.0001, + "loss": 8.4801, + "loss/crossentropy": 2.363473057746887, + "loss/hidden": 3.3125, + "loss/jsd": 0.0, + "loss/logits": 0.30813442170619965, + "step": 1820 + }, + { + "epoch": 0.113875, + "grad_norm": 3.15625, + "grad_norm_var": 0.4840169270833333, + "learning_rate": 0.0001, + "loss": 8.5864, + "loss/crossentropy": 2.3755160570144653, + "loss/hidden": 3.2734375, + "loss/jsd": 0.0, + "loss/logits": 0.2846773564815521, + "step": 1822 + }, + { + "epoch": 0.114, + "grad_norm": 3.21875, + "grad_norm_var": 0.5322580973307292, + "learning_rate": 0.0001, + "loss": 8.4673, + "loss/crossentropy": 2.3385982513427734, + "loss/hidden": 3.2890625, + "loss/jsd": 0.0, + "loss/logits": 0.2795634865760803, + "step": 1824 + }, + { + "epoch": 0.114125, + "grad_norm": 3.4375, + "grad_norm_var": 0.5356730143229167, + "learning_rate": 0.0001, + "loss": 8.5786, + "loss/crossentropy": 2.5109734535217285, + "loss/hidden": 3.2734375, + "loss/jsd": 0.0, + "loss/logits": 0.28478285670280457, + "step": 1826 + }, + { + "epoch": 0.11425, + "grad_norm": 3.46875, + "grad_norm_var": 0.5471995035807292, + "learning_rate": 0.0001, + "loss": 8.6909, + "loss/crossentropy": 2.5854321718215942, + "loss/hidden": 3.484375, + "loss/jsd": 0.0, + "loss/logits": 0.31594397127628326, + "step": 1828 + }, + { + "epoch": 0.114375, + "grad_norm": 3.21875, + "grad_norm_var": 0.3377349853515625, + "learning_rate": 0.0001, + "loss": 8.5017, + "loss/crossentropy": 2.3423889875411987, + "loss/hidden": 3.296875, + "loss/jsd": 0.0, + "loss/logits": 0.3003038465976715, + "step": 1830 + }, + { + "epoch": 0.1145, + "grad_norm": 3.578125, + "grad_norm_var": 0.2010406494140625, + "learning_rate": 0.0001, + "loss": 8.8625, + "loss/crossentropy": 2.612884998321533, + "loss/hidden": 3.453125, + "loss/jsd": 0.0, + "loss/logits": 0.28992408514022827, + "step": 1832 + }, + { + "epoch": 0.114625, + "grad_norm": 3.265625, + "grad_norm_var": 0.026301066080729168, + "learning_rate": 0.0001, + "loss": 8.6717, + "loss/crossentropy": 2.2408013343811035, + "loss/hidden": 3.2734375, + "loss/jsd": 0.0, + "loss/logits": 0.27288930118083954, + "step": 1834 + }, + { + "epoch": 0.11475, + "grad_norm": 3.3125, + "grad_norm_var": 0.023957316080729166, + "learning_rate": 0.0001, + "loss": 8.7519, + "loss/crossentropy": 2.5239880084991455, + "loss/hidden": 3.5, + "loss/jsd": 0.0, + "loss/logits": 0.3304894417524338, + "step": 1836 + }, + { + "epoch": 0.114875, + "grad_norm": 3.453125, + "grad_norm_var": 0.017365519205729166, + "learning_rate": 0.0001, + "loss": 8.5288, + "loss/crossentropy": 2.295746088027954, + "loss/hidden": 3.3515625, + "loss/jsd": 0.0, + "loss/logits": 0.28775452077388763, + "step": 1838 + }, + { + "epoch": 0.115, + "grad_norm": 3.421875, + "grad_norm_var": 0.020198567708333334, + "learning_rate": 0.0001, + "loss": 8.5755, + "loss/crossentropy": 2.367674708366394, + "loss/hidden": 3.2890625, + "loss/jsd": 0.0, + "loss/logits": 0.2802489101886749, + "step": 1840 + }, + { + "epoch": 0.115125, + "grad_norm": 3.1875, + "grad_norm_var": 0.022777303059895834, + "learning_rate": 0.0001, + "loss": 8.6102, + "loss/crossentropy": 2.387988567352295, + "loss/hidden": 3.3671875, + "loss/jsd": 0.0, + "loss/logits": 0.2998049259185791, + "step": 1842 + }, + { + "epoch": 0.11525, + "grad_norm": 3.0625, + "grad_norm_var": 0.0284576416015625, + "learning_rate": 0.0001, + "loss": 8.4168, + "loss/crossentropy": 2.1790847778320312, + "loss/hidden": 3.2265625, + "loss/jsd": 0.0, + "loss/logits": 0.2700785994529724, + "step": 1844 + }, + { + "epoch": 0.115375, + "grad_norm": 3.46875, + "grad_norm_var": 0.024803670247395833, + "learning_rate": 0.0001, + "loss": 8.7029, + "loss/crossentropy": 2.5277167558670044, + "loss/hidden": 3.421875, + "loss/jsd": 0.0, + "loss/logits": 0.3007604479789734, + "step": 1846 + }, + { + "epoch": 0.1155, + "grad_norm": 3.3125, + "grad_norm_var": 0.021043904622395835, + "learning_rate": 0.0001, + "loss": 8.7105, + "loss/crossentropy": 2.417070746421814, + "loss/hidden": 3.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.30565008521080017, + "step": 1848 + }, + { + "epoch": 0.115625, + "grad_norm": 3.421875, + "grad_norm_var": 0.03267313639322917, + "learning_rate": 0.0001, + "loss": 8.7572, + "loss/crossentropy": 2.581299304962158, + "loss/hidden": 3.3203125, + "loss/jsd": 0.0, + "loss/logits": 0.2796669900417328, + "step": 1850 + }, + { + "epoch": 0.11575, + "grad_norm": 3.46875, + "grad_norm_var": 0.03391825358072917, + "learning_rate": 0.0001, + "loss": 8.7661, + "loss/crossentropy": 2.385110855102539, + "loss/hidden": 3.359375, + "loss/jsd": 0.0, + "loss/logits": 0.2876836359500885, + "step": 1852 + }, + { + "epoch": 0.115875, + "grad_norm": 3.59375, + "grad_norm_var": 0.036637369791666666, + "learning_rate": 0.0001, + "loss": 9.1625, + "loss/crossentropy": 2.503122568130493, + "loss/hidden": 3.484375, + "loss/jsd": 0.0, + "loss/logits": 0.3233593702316284, + "step": 1854 + }, + { + "epoch": 0.116, + "grad_norm": 3.453125, + "grad_norm_var": 0.03401590983072917, + "learning_rate": 0.0001, + "loss": 8.964, + "loss/crossentropy": 2.4058728218078613, + "loss/hidden": 3.3984375, + "loss/jsd": 0.0, + "loss/logits": 0.349610835313797, + "step": 1856 + }, + { + "epoch": 0.116125, + "grad_norm": 3.6875, + "grad_norm_var": 0.032796223958333336, + "learning_rate": 0.0001, + "loss": 8.6335, + "loss/crossentropy": 2.472745180130005, + "loss/hidden": 3.34375, + "loss/jsd": 0.0, + "loss/logits": 0.28807832300662994, + "step": 1858 + }, + { + "epoch": 0.11625, + "grad_norm": 3.171875, + "grad_norm_var": 0.026276652018229166, + "learning_rate": 0.0001, + "loss": 8.6263, + "loss/crossentropy": 2.377658724784851, + "loss/hidden": 3.390625, + "loss/jsd": 0.0, + "loss/logits": 0.2969461679458618, + "step": 1860 + }, + { + "epoch": 0.116375, + "grad_norm": 3.46875, + "grad_norm_var": 0.030744425455729165, + "learning_rate": 0.0001, + "loss": 8.523, + "loss/crossentropy": 2.261406660079956, + "loss/hidden": 3.2734375, + "loss/jsd": 0.0, + "loss/logits": 0.2751418948173523, + "step": 1862 + }, + { + "epoch": 0.1165, + "grad_norm": 3.734375, + "grad_norm_var": 0.033722941080729166, + "learning_rate": 0.0001, + "loss": 8.552, + "loss/crossentropy": 2.3539315462112427, + "loss/hidden": 3.2578125, + "loss/jsd": 0.0, + "loss/logits": 0.2792647182941437, + "step": 1864 + }, + { + "epoch": 0.116625, + "grad_norm": 3.375, + "grad_norm_var": 0.026949055989583335, + "learning_rate": 0.0001, + "loss": 8.5773, + "loss/crossentropy": 2.5640159845352173, + "loss/hidden": 3.3984375, + "loss/jsd": 0.0, + "loss/logits": 0.2862507700920105, + "step": 1866 + }, + { + "epoch": 0.11675, + "grad_norm": 3.203125, + "grad_norm_var": 0.032013956705729166, + "learning_rate": 0.0001, + "loss": 8.489, + "loss/crossentropy": 2.419552803039551, + "loss/hidden": 3.2734375, + "loss/jsd": 0.0, + "loss/logits": 0.28335337340831757, + "step": 1868 + }, + { + "epoch": 0.116875, + "grad_norm": 3.421875, + "grad_norm_var": 0.029059855143229167, + "learning_rate": 0.0001, + "loss": 8.2858, + "loss/crossentropy": 2.206141471862793, + "loss/hidden": 3.484375, + "loss/jsd": 0.0, + "loss/logits": 0.2972014546394348, + "step": 1870 + }, + { + "epoch": 0.117, + "grad_norm": 3.5625, + "grad_norm_var": 0.03101806640625, + "learning_rate": 0.0001, + "loss": 8.4301, + "loss/crossentropy": 2.2040516138076782, + "loss/hidden": 3.3359375, + "loss/jsd": 0.0, + "loss/logits": 0.28033290803432465, + "step": 1872 + }, + { + "epoch": 0.117125, + "grad_norm": 3.40625, + "grad_norm_var": 0.0343902587890625, + "learning_rate": 0.0001, + "loss": 8.6147, + "loss/crossentropy": 2.4695621728897095, + "loss/hidden": 3.3828125, + "loss/jsd": 0.0, + "loss/logits": 0.28286705911159515, + "step": 1874 + }, + { + "epoch": 0.11725, + "grad_norm": 3.53125, + "grad_norm_var": 0.03194986979166667, + "learning_rate": 0.0001, + "loss": 8.6246, + "loss/crossentropy": 2.3480403423309326, + "loss/hidden": 3.3828125, + "loss/jsd": 0.0, + "loss/logits": 0.29076308012008667, + "step": 1876 + }, + { + "epoch": 0.117375, + "grad_norm": 3.1875, + "grad_norm_var": 0.033991495768229164, + "learning_rate": 0.0001, + "loss": 8.6359, + "loss/crossentropy": 2.3183737993240356, + "loss/hidden": 3.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.29951538145542145, + "step": 1878 + }, + { + "epoch": 0.1175, + "grad_norm": 3.765625, + "grad_norm_var": 0.03533528645833333, + "learning_rate": 0.0001, + "loss": 8.7296, + "loss/crossentropy": 2.259859085083008, + "loss/hidden": 3.265625, + "loss/jsd": 0.0, + "loss/logits": 0.28555014729499817, + "step": 1880 + }, + { + "epoch": 0.117625, + "grad_norm": 3.203125, + "grad_norm_var": 0.03573811848958333, + "learning_rate": 0.0001, + "loss": 8.5874, + "loss/crossentropy": 2.5705759525299072, + "loss/hidden": 3.390625, + "loss/jsd": 0.0, + "loss/logits": 0.2966092526912689, + "step": 1882 + }, + { + "epoch": 0.11775, + "grad_norm": 3.546875, + "grad_norm_var": 0.034619140625, + "learning_rate": 0.0001, + "loss": 8.6653, + "loss/crossentropy": 2.324189782142639, + "loss/hidden": 3.3203125, + "loss/jsd": 0.0, + "loss/logits": 0.28070059418678284, + "step": 1884 + }, + { + "epoch": 0.117875, + "grad_norm": 4.09375, + "grad_norm_var": 0.06123046875, + "learning_rate": 0.0001, + "loss": 8.9445, + "loss/crossentropy": 2.6003164052963257, + "loss/hidden": 3.5234375, + "loss/jsd": 0.0, + "loss/logits": 0.42599718272686005, + "step": 1886 + }, + { + "epoch": 0.118, + "grad_norm": 3.359375, + "grad_norm_var": 0.06826070149739584, + "learning_rate": 0.0001, + "loss": 8.4642, + "loss/crossentropy": 2.1944313049316406, + "loss/hidden": 3.2890625, + "loss/jsd": 0.0, + "loss/logits": 0.27223484218120575, + "step": 1888 + }, + { + "epoch": 0.118125, + "grad_norm": 3.328125, + "grad_norm_var": 0.0644683837890625, + "learning_rate": 0.0001, + "loss": 8.6562, + "loss/crossentropy": 2.445297122001648, + "loss/hidden": 3.2890625, + "loss/jsd": 0.0, + "loss/logits": 0.29588577151298523, + "step": 1890 + }, + { + "epoch": 0.11825, + "grad_norm": 3.421875, + "grad_norm_var": 0.08600260416666666, + "learning_rate": 0.0001, + "loss": 8.6856, + "loss/crossentropy": 2.2999027371406555, + "loss/hidden": 3.3046875, + "loss/jsd": 0.0, + "loss/logits": 0.2753405198454857, + "step": 1892 + }, + { + "epoch": 0.118375, + "grad_norm": 3.390625, + "grad_norm_var": 0.07831929524739584, + "learning_rate": 0.0001, + "loss": 8.5229, + "loss/crossentropy": 2.399373769760132, + "loss/hidden": 3.421875, + "loss/jsd": 0.0, + "loss/logits": 0.2927252948284149, + "step": 1894 + }, + { + "epoch": 0.1185, + "grad_norm": 3.203125, + "grad_norm_var": 0.07942606608072916, + "learning_rate": 0.0001, + "loss": 8.5229, + "loss/crossentropy": 2.4299787282943726, + "loss/hidden": 3.34375, + "loss/jsd": 0.0, + "loss/logits": 0.30517131090164185, + "step": 1896 + }, + { + "epoch": 0.118625, + "grad_norm": 3.453125, + "grad_norm_var": 0.07559305826822917, + "learning_rate": 0.0001, + "loss": 8.4977, + "loss/crossentropy": 1.9518161416053772, + "loss/hidden": 3.2734375, + "loss/jsd": 0.0, + "loss/logits": 0.27373121678829193, + "step": 1898 + }, + { + "epoch": 0.11875, + "grad_norm": 3.734375, + "grad_norm_var": 0.07967020670572916, + "learning_rate": 0.0001, + "loss": 8.4314, + "loss/crossentropy": 2.2160075902938843, + "loss/hidden": 3.390625, + "loss/jsd": 0.0, + "loss/logits": 0.2800438925623894, + "step": 1900 + }, + { + "epoch": 0.118875, + "grad_norm": 3.171875, + "grad_norm_var": 0.052912394205729164, + "learning_rate": 0.0001, + "loss": 8.4264, + "loss/crossentropy": 2.2488516569137573, + "loss/hidden": 3.2578125, + "loss/jsd": 0.0, + "loss/logits": 0.2859164774417877, + "step": 1902 + }, + { + "epoch": 0.119, + "grad_norm": 3.390625, + "grad_norm_var": 0.04648030598958333, + "learning_rate": 0.0001, + "loss": 8.7733, + "loss/crossentropy": 2.298749089241028, + "loss/hidden": 3.3828125, + "loss/jsd": 0.0, + "loss/logits": 0.3013346642255783, + "step": 1904 + }, + { + "epoch": 0.119125, + "grad_norm": 3.34375, + "grad_norm_var": 0.04391276041666667, + "learning_rate": 0.0001, + "loss": 8.607, + "loss/crossentropy": 2.5380003452301025, + "loss/hidden": 3.3046875, + "loss/jsd": 0.0, + "loss/logits": 0.2967790812253952, + "step": 1906 + }, + { + "epoch": 0.11925, + "grad_norm": 3.828125, + "grad_norm_var": 0.03439127604166667, + "learning_rate": 0.0001, + "loss": 8.673, + "loss/crossentropy": 2.4969903230667114, + "loss/hidden": 3.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.34014879167079926, + "step": 1908 + }, + { + "epoch": 0.119375, + "grad_norm": 3.734375, + "grad_norm_var": 0.04801025390625, + "learning_rate": 0.0001, + "loss": 8.9499, + "loss/crossentropy": 2.326760768890381, + "loss/hidden": 3.453125, + "loss/jsd": 0.0, + "loss/logits": 0.3319539427757263, + "step": 1910 + }, + { + "epoch": 0.1195, + "grad_norm": 3.46875, + "grad_norm_var": 0.04390869140625, + "learning_rate": 0.0001, + "loss": 8.983, + "loss/crossentropy": 2.3548574447631836, + "loss/hidden": 3.390625, + "loss/jsd": 0.0, + "loss/logits": 0.3132203370332718, + "step": 1912 + }, + { + "epoch": 0.119625, + "grad_norm": 3.25, + "grad_norm_var": 0.04912109375, + "learning_rate": 0.0001, + "loss": 8.4837, + "loss/crossentropy": 2.301589846611023, + "loss/hidden": 3.4140625, + "loss/jsd": 0.0, + "loss/logits": 0.28266826272010803, + "step": 1914 + }, + { + "epoch": 0.11975, + "grad_norm": 3.3125, + "grad_norm_var": 0.0535797119140625, + "learning_rate": 0.0001, + "loss": 9.0105, + "loss/crossentropy": 2.2812716960906982, + "loss/hidden": 3.3203125, + "loss/jsd": 0.0, + "loss/logits": 0.3110269755125046, + "step": 1916 + }, + { + "epoch": 0.119875, + "grad_norm": 3.65625, + "grad_norm_var": 0.05188700358072917, + "learning_rate": 0.0001, + "loss": 8.7659, + "loss/crossentropy": 2.4299668073654175, + "loss/hidden": 3.2890625, + "loss/jsd": 0.0, + "loss/logits": 0.2899664491415024, + "step": 1918 + }, + { + "epoch": 0.12, + "grad_norm": 3.421875, + "grad_norm_var": 0.059403483072916666, + "learning_rate": 0.0001, + "loss": 8.6172, + "loss/crossentropy": 2.2184523940086365, + "loss/hidden": 3.3203125, + "loss/jsd": 0.0, + "loss/logits": 0.2705395221710205, + "step": 1920 + }, + { + "epoch": 0.120125, + "grad_norm": 3.28125, + "grad_norm_var": 0.061742146809895836, + "learning_rate": 0.0001, + "loss": 8.6808, + "loss/crossentropy": 2.361995577812195, + "loss/hidden": 3.3046875, + "loss/jsd": 0.0, + "loss/logits": 0.2976345121860504, + "step": 1922 + }, + { + "epoch": 0.12025, + "grad_norm": 3.140625, + "grad_norm_var": 0.06204427083333333, + "learning_rate": 0.0001, + "loss": 8.572, + "loss/crossentropy": 2.0673895478248596, + "loss/hidden": 3.328125, + "loss/jsd": 0.0, + "loss/logits": 0.2614962309598923, + "step": 1924 + }, + { + "epoch": 0.120375, + "grad_norm": 3.265625, + "grad_norm_var": 0.050324503580729166, + "learning_rate": 0.0001, + "loss": 8.3519, + "loss/crossentropy": 2.333189368247986, + "loss/hidden": 3.296875, + "loss/jsd": 0.0, + "loss/logits": 0.2846105396747589, + "step": 1926 + }, + { + "epoch": 0.1205, + "grad_norm": 3.921875, + "grad_norm_var": 0.07100321451822916, + "learning_rate": 0.0001, + "loss": 8.5546, + "loss/crossentropy": 2.3604001998901367, + "loss/hidden": 3.296875, + "loss/jsd": 0.0, + "loss/logits": 0.2743445485830307, + "step": 1928 + }, + { + "epoch": 0.120625, + "grad_norm": 3.15625, + "grad_norm_var": 0.07200113932291667, + "learning_rate": 0.0001, + "loss": 8.4642, + "loss/crossentropy": 2.3226892948150635, + "loss/hidden": 3.34375, + "loss/jsd": 0.0, + "loss/logits": 0.2899642139673233, + "step": 1930 + }, + { + "epoch": 0.12075, + "grad_norm": 3.140625, + "grad_norm_var": 0.06077372233072917, + "learning_rate": 0.0001, + "loss": 8.4469, + "loss/crossentropy": 2.2486273050308228, + "loss/hidden": 3.3359375, + "loss/jsd": 0.0, + "loss/logits": 0.29645949602127075, + "step": 1932 + }, + { + "epoch": 0.120875, + "grad_norm": 3.375, + "grad_norm_var": 0.04622294108072917, + "learning_rate": 0.0001, + "loss": 8.4223, + "loss/crossentropy": 2.590661644935608, + "loss/hidden": 3.2734375, + "loss/jsd": 0.0, + "loss/logits": 0.2843063771724701, + "step": 1934 + }, + { + "epoch": 0.121, + "grad_norm": 2.984375, + "grad_norm_var": 0.05113525390625, + "learning_rate": 0.0001, + "loss": 8.5287, + "loss/crossentropy": 2.392248034477234, + "loss/hidden": 3.3125, + "loss/jsd": 0.0, + "loss/logits": 0.28749269247055054, + "step": 1936 + }, + { + "epoch": 0.121125, + "grad_norm": 3.5, + "grad_norm_var": 0.05465087890625, + "learning_rate": 0.0001, + "loss": 8.4881, + "loss/crossentropy": 2.3735626935958862, + "loss/hidden": 3.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.3381284326314926, + "step": 1938 + }, + { + "epoch": 0.12125, + "grad_norm": 3.953125, + "grad_norm_var": 0.07750244140625, + "learning_rate": 0.0001, + "loss": 8.7097, + "loss/crossentropy": 2.5563061237335205, + "loss/hidden": 3.4140625, + "loss/jsd": 0.0, + "loss/logits": 0.3243313729763031, + "step": 1940 + }, + { + "epoch": 0.121375, + "grad_norm": 3.078125, + "grad_norm_var": 0.0794342041015625, + "learning_rate": 0.0001, + "loss": 8.6438, + "loss/crossentropy": 2.2933984994888306, + "loss/hidden": 3.28125, + "loss/jsd": 0.0, + "loss/logits": 0.29726622998714447, + "step": 1942 + }, + { + "epoch": 0.1215, + "grad_norm": 3.390625, + "grad_norm_var": 0.06724344889322917, + "learning_rate": 0.0001, + "loss": 8.4159, + "loss/crossentropy": 2.420395255088806, + "loss/hidden": 3.296875, + "loss/jsd": 0.0, + "loss/logits": 0.2696368545293808, + "step": 1944 + }, + { + "epoch": 0.121625, + "grad_norm": 3.15625, + "grad_norm_var": 0.06702067057291666, + "learning_rate": 0.0001, + "loss": 8.5616, + "loss/crossentropy": 2.1465864777565002, + "loss/hidden": 3.28125, + "loss/jsd": 0.0, + "loss/logits": 0.2801681458950043, + "step": 1946 + }, + { + "epoch": 0.12175, + "grad_norm": 3.515625, + "grad_norm_var": 0.11573893229166667, + "learning_rate": 0.0001, + "loss": 8.8623, + "loss/crossentropy": 2.454163372516632, + "loss/hidden": 3.5703125, + "loss/jsd": 0.0, + "loss/logits": 0.27878041565418243, + "step": 1948 + }, + { + "epoch": 0.121875, + "grad_norm": 3.40625, + "grad_norm_var": 0.12135009765625, + "learning_rate": 0.0001, + "loss": 8.2899, + "loss/crossentropy": 2.143743395805359, + "loss/hidden": 3.25, + "loss/jsd": 0.0, + "loss/logits": 0.2612776756286621, + "step": 1950 + }, + { + "epoch": 0.122, + "grad_norm": 3.71875, + "grad_norm_var": 0.13105367024739584, + "learning_rate": 0.0001, + "loss": 9.0178, + "loss/crossentropy": 2.4097328186035156, + "loss/hidden": 3.5234375, + "loss/jsd": 0.0, + "loss/logits": 0.3114086985588074, + "step": 1952 + }, + { + "epoch": 0.122125, + "grad_norm": 5.15625, + "grad_norm_var": 0.2849355061848958, + "learning_rate": 0.0001, + "loss": 8.7298, + "loss/crossentropy": 2.3883849382400513, + "loss/hidden": 3.359375, + "loss/jsd": 0.0, + "loss/logits": 0.3800114840269089, + "step": 1954 + }, + { + "epoch": 0.12225, + "grad_norm": 3.15625, + "grad_norm_var": 0.31787007649739585, + "learning_rate": 0.0001, + "loss": 8.4569, + "loss/crossentropy": 2.2481584548950195, + "loss/hidden": 3.46875, + "loss/jsd": 0.0, + "loss/logits": 0.2927142381668091, + "step": 1956 + }, + { + "epoch": 0.122375, + "grad_norm": 3.34375, + "grad_norm_var": 0.3129058837890625, + "learning_rate": 0.0001, + "loss": 8.5422, + "loss/crossentropy": 2.096981406211853, + "loss/hidden": 3.2890625, + "loss/jsd": 0.0, + "loss/logits": 0.27731966972351074, + "step": 1958 + }, + { + "epoch": 0.1225, + "grad_norm": 3.4375, + "grad_norm_var": 0.2894205729166667, + "learning_rate": 0.0001, + "loss": 8.5498, + "loss/crossentropy": 2.531270146369934, + "loss/hidden": 3.3359375, + "loss/jsd": 0.0, + "loss/logits": 0.265569344162941, + "step": 1960 + }, + { + "epoch": 0.122625, + "grad_norm": 3.25, + "grad_norm_var": 0.28873291015625, + "learning_rate": 0.0001, + "loss": 8.521, + "loss/crossentropy": 2.6065129041671753, + "loss/hidden": 3.3828125, + "loss/jsd": 0.0, + "loss/logits": 0.2991243004798889, + "step": 1962 + }, + { + "epoch": 0.12275, + "grad_norm": 3.546875, + "grad_norm_var": 0.2656158447265625, + "learning_rate": 0.0001, + "loss": 8.5729, + "loss/crossentropy": 2.261076807975769, + "loss/hidden": 3.2890625, + "loss/jsd": 0.0, + "loss/logits": 0.28016628324985504, + "step": 1964 + }, + { + "epoch": 0.122875, + "grad_norm": 3.34375, + "grad_norm_var": 0.2662750244140625, + "learning_rate": 0.0001, + "loss": 8.7329, + "loss/crossentropy": 2.4433281421661377, + "loss/hidden": 3.3125, + "loss/jsd": 0.0, + "loss/logits": 0.2746448367834091, + "step": 1966 + }, + { + "epoch": 0.123, + "grad_norm": 3.625, + "grad_norm_var": 0.3149810791015625, + "learning_rate": 0.0001, + "loss": 8.7222, + "loss/crossentropy": 2.2890138626098633, + "loss/hidden": 3.390625, + "loss/jsd": 0.0, + "loss/logits": 0.29224395751953125, + "step": 1968 + }, + { + "epoch": 0.123125, + "grad_norm": 3.21875, + "grad_norm_var": 0.15416259765625, + "learning_rate": 0.0001, + "loss": 8.3613, + "loss/crossentropy": 2.1031445264816284, + "loss/hidden": 3.3359375, + "loss/jsd": 0.0, + "loss/logits": 0.2901540696620941, + "step": 1970 + }, + { + "epoch": 0.12325, + "grad_norm": 3.453125, + "grad_norm_var": 0.11484273274739583, + "learning_rate": 0.0001, + "loss": 8.4639, + "loss/crossentropy": 2.240355134010315, + "loss/hidden": 3.2421875, + "loss/jsd": 0.0, + "loss/logits": 0.27891701459884644, + "step": 1972 + }, + { + "epoch": 0.123375, + "grad_norm": 3.25, + "grad_norm_var": 0.11433919270833333, + "learning_rate": 0.0001, + "loss": 8.6588, + "loss/crossentropy": 2.526862621307373, + "loss/hidden": 3.34375, + "loss/jsd": 0.0, + "loss/logits": 0.31067635118961334, + "step": 1974 + }, + { + "epoch": 0.1235, + "grad_norm": 3.078125, + "grad_norm_var": 0.12138570149739583, + "learning_rate": 0.0001, + "loss": 8.4162, + "loss/crossentropy": 2.389139413833618, + "loss/hidden": 3.3359375, + "loss/jsd": 0.0, + "loss/logits": 0.3020573705434799, + "step": 1976 + }, + { + "epoch": 0.123625, + "grad_norm": 3.59375, + "grad_norm_var": 0.12506103515625, + "learning_rate": 0.0001, + "loss": 8.4219, + "loss/crossentropy": 2.438482403755188, + "loss/hidden": 3.3203125, + "loss/jsd": 0.0, + "loss/logits": 0.26280972361564636, + "step": 1978 + }, + { + "epoch": 0.12375, + "grad_norm": 3.21875, + "grad_norm_var": 0.12905985514322918, + "learning_rate": 0.0001, + "loss": 8.7076, + "loss/crossentropy": 2.342813014984131, + "loss/hidden": 3.2890625, + "loss/jsd": 0.0, + "loss/logits": 0.2951369285583496, + "step": 1980 + }, + { + "epoch": 0.123875, + "grad_norm": 3.296875, + "grad_norm_var": 0.13946024576822916, + "learning_rate": 0.0001, + "loss": 8.4449, + "loss/crossentropy": 2.1303473114967346, + "loss/hidden": 3.2265625, + "loss/jsd": 0.0, + "loss/logits": 0.26047107577323914, + "step": 1982 + }, + { + "epoch": 0.124, + "grad_norm": 3.140625, + "grad_norm_var": 0.030985514322916668, + "learning_rate": 0.0001, + "loss": 8.5657, + "loss/crossentropy": 2.470989942550659, + "loss/hidden": 3.34375, + "loss/jsd": 0.0, + "loss/logits": 0.289617583155632, + "step": 1984 + }, + { + "epoch": 0.124125, + "grad_norm": 3.59375, + "grad_norm_var": 0.03736572265625, + "learning_rate": 0.0001, + "loss": 8.541, + "loss/crossentropy": 2.4312928915023804, + "loss/hidden": 3.34375, + "loss/jsd": 0.0, + "loss/logits": 0.2940659523010254, + "step": 1986 + }, + { + "epoch": 0.12425, + "grad_norm": 3.109375, + "grad_norm_var": 0.0355865478515625, + "learning_rate": 0.0001, + "loss": 8.5967, + "loss/crossentropy": 2.408700704574585, + "loss/hidden": 3.3046875, + "loss/jsd": 0.0, + "loss/logits": 0.29635827243328094, + "step": 1988 + }, + { + "epoch": 0.124375, + "grad_norm": 3.09375, + "grad_norm_var": 0.0367095947265625, + "learning_rate": 0.0001, + "loss": 8.5236, + "loss/crossentropy": 2.222353756427765, + "loss/hidden": 3.3671875, + "loss/jsd": 0.0, + "loss/logits": 0.2752675265073776, + "step": 1990 + }, + { + "epoch": 0.1245, + "grad_norm": 3.390625, + "grad_norm_var": 0.03466796875, + "learning_rate": 0.0001, + "loss": 8.7415, + "loss/crossentropy": 2.5249141454696655, + "loss/hidden": 3.3359375, + "loss/jsd": 0.0, + "loss/logits": 0.31805843114852905, + "step": 1992 + }, + { + "epoch": 0.124625, + "grad_norm": 3.296875, + "grad_norm_var": 0.0289947509765625, + "learning_rate": 0.0001, + "loss": 8.3748, + "loss/crossentropy": 1.9848942756652832, + "loss/hidden": 3.2578125, + "loss/jsd": 0.0, + "loss/logits": 0.2688567340373993, + "step": 1994 + }, + { + "epoch": 0.12475, + "grad_norm": 3.453125, + "grad_norm_var": 0.02486572265625, + "learning_rate": 0.0001, + "loss": 8.8413, + "loss/crossentropy": 2.657674193382263, + "loss/hidden": 3.3515625, + "loss/jsd": 0.0, + "loss/logits": 0.27856191992759705, + "step": 1996 + }, + { + "epoch": 0.124875, + "grad_norm": 3.171875, + "grad_norm_var": 0.023688761393229167, + "learning_rate": 0.0001, + "loss": 8.3386, + "loss/crossentropy": 2.289615035057068, + "loss/hidden": 3.234375, + "loss/jsd": 0.0, + "loss/logits": 0.27829277515411377, + "step": 1998 + }, + { + "epoch": 0.125, + "grad_norm": 3.40625, + "grad_norm_var": 0.0205230712890625, + "learning_rate": 0.0001, + "loss": 8.5863, + "loss/crossentropy": 2.5303882360458374, + "loss/hidden": 3.34375, + "loss/jsd": 0.0, + "loss/logits": 0.2958512306213379, + "step": 2000 + }, + { + "epoch": 0.125125, + "grad_norm": 2.921875, + "grad_norm_var": 0.025028483072916666, + "learning_rate": 0.0001, + "loss": 8.3451, + "loss/crossentropy": 2.200709104537964, + "loss/hidden": 3.28125, + "loss/jsd": 0.0, + "loss/logits": 0.26912133395671844, + "step": 2002 + }, + { + "epoch": 0.12525, + "grad_norm": 3.203125, + "grad_norm_var": 0.0232330322265625, + "learning_rate": 0.0001, + "loss": 8.3675, + "loss/crossentropy": 2.523932099342346, + "loss/hidden": 3.3046875, + "loss/jsd": 0.0, + "loss/logits": 0.292253702878952, + "step": 2004 + }, + { + "epoch": 0.125375, + "grad_norm": 2.921875, + "grad_norm_var": 0.029662068684895834, + "learning_rate": 0.0001, + "loss": 8.4154, + "loss/crossentropy": 2.3884671926498413, + "loss/hidden": 3.3515625, + "loss/jsd": 0.0, + "loss/logits": 0.28686901926994324, + "step": 2006 + }, + { + "epoch": 0.1255, + "grad_norm": 3.34375, + "grad_norm_var": 0.032633463541666664, + "learning_rate": 0.0001, + "loss": 8.4873, + "loss/crossentropy": 2.492961883544922, + "loss/hidden": 3.3203125, + "loss/jsd": 0.0, + "loss/logits": 0.2938031554222107, + "step": 2008 + }, + { + "epoch": 0.125625, + "grad_norm": 3.46875, + "grad_norm_var": 0.04558919270833333, + "learning_rate": 0.0001, + "loss": 8.4653, + "loss/crossentropy": 2.1341389417648315, + "loss/hidden": 3.328125, + "loss/jsd": 0.0, + "loss/logits": 0.2501054108142853, + "step": 2010 + }, + { + "epoch": 0.12575, + "grad_norm": 3.390625, + "grad_norm_var": 0.043919881184895836, + "learning_rate": 0.0001, + "loss": 8.6647, + "loss/crossentropy": 2.3359371423721313, + "loss/hidden": 3.34375, + "loss/jsd": 0.0, + "loss/logits": 0.2804300785064697, + "step": 2012 + }, + { + "epoch": 0.125875, + "grad_norm": 3.296875, + "grad_norm_var": 0.04475809733072917, + "learning_rate": 0.0001, + "loss": 8.6907, + "loss/crossentropy": 2.5302098989486694, + "loss/hidden": 3.2890625, + "loss/jsd": 0.0, + "loss/logits": 0.28145162761211395, + "step": 2014 + }, + { + "epoch": 0.126, + "grad_norm": 3.234375, + "grad_norm_var": 0.04456278483072917, + "learning_rate": 0.0001, + "loss": 8.6119, + "loss/crossentropy": 2.437941551208496, + "loss/hidden": 3.3125, + "loss/jsd": 0.0, + "loss/logits": 0.26573941111564636, + "step": 2016 + }, + { + "epoch": 0.126125, + "grad_norm": 3.40625, + "grad_norm_var": 0.035676066080729166, + "learning_rate": 0.0001, + "loss": 8.6258, + "loss/crossentropy": 2.6573965549468994, + "loss/hidden": 3.3984375, + "loss/jsd": 0.0, + "loss/logits": 0.29143331944942474, + "step": 2018 + }, + { + "epoch": 0.12625, + "grad_norm": 3.34375, + "grad_norm_var": 0.03544921875, + "learning_rate": 0.0001, + "loss": 8.4249, + "loss/crossentropy": 2.4629688262939453, + "loss/hidden": 3.328125, + "loss/jsd": 0.0, + "loss/logits": 0.28354427218437195, + "step": 2020 + }, + { + "epoch": 0.126375, + "grad_norm": 4.21875, + "grad_norm_var": 0.0703277587890625, + "learning_rate": 0.0001, + "loss": 8.8307, + "loss/crossentropy": 2.252517580986023, + "loss/hidden": 3.328125, + "loss/jsd": 0.0, + "loss/logits": 0.3386313170194626, + "step": 2022 + }, + { + "epoch": 0.1265, + "grad_norm": 3.34375, + "grad_norm_var": 0.06189676920572917, + "learning_rate": 0.0001, + "loss": 8.2863, + "loss/crossentropy": 2.348217487335205, + "loss/hidden": 3.2578125, + "loss/jsd": 0.0, + "loss/logits": 0.24487578123807907, + "step": 2024 + }, + { + "epoch": 0.126625, + "grad_norm": 3.234375, + "grad_norm_var": 0.057779947916666664, + "learning_rate": 0.0001, + "loss": 8.3008, + "loss/crossentropy": 2.2095032930374146, + "loss/hidden": 3.3125, + "loss/jsd": 0.0, + "loss/logits": 0.2705332636833191, + "step": 2026 + }, + { + "epoch": 0.12675, + "grad_norm": 3.15625, + "grad_norm_var": 0.06265869140625, + "learning_rate": 0.0001, + "loss": 8.4107, + "loss/crossentropy": 2.167417824268341, + "loss/hidden": 3.2734375, + "loss/jsd": 0.0, + "loss/logits": 0.27016985416412354, + "step": 2028 + }, + { + "epoch": 0.126875, + "grad_norm": 3.234375, + "grad_norm_var": 0.06298726399739583, + "learning_rate": 0.0001, + "loss": 8.4653, + "loss/crossentropy": 2.316556692123413, + "loss/hidden": 3.34375, + "loss/jsd": 0.0, + "loss/logits": 0.2680739611387253, + "step": 2030 + }, + { + "epoch": 0.127, + "grad_norm": 3.0625, + "grad_norm_var": 0.06646728515625, + "learning_rate": 0.0001, + "loss": 8.4679, + "loss/crossentropy": 2.220218300819397, + "loss/hidden": 3.265625, + "loss/jsd": 0.0, + "loss/logits": 0.2792097181081772, + "step": 2032 + }, + { + "epoch": 0.127125, + "grad_norm": 3.3125, + "grad_norm_var": 0.06718343098958333, + "learning_rate": 0.0001, + "loss": 8.4241, + "loss/crossentropy": 2.2215596437454224, + "loss/hidden": 3.28125, + "loss/jsd": 0.0, + "loss/logits": 0.2561402767896652, + "step": 2034 + }, + { + "epoch": 0.12725, + "grad_norm": 3.109375, + "grad_norm_var": 0.0806060791015625, + "learning_rate": 0.0001, + "loss": 8.1138, + "loss/crossentropy": 2.2149853706359863, + "loss/hidden": 3.2734375, + "loss/jsd": 0.0, + "loss/logits": 0.26566116511821747, + "step": 2036 + }, + { + "epoch": 0.127375, + "grad_norm": 3.15625, + "grad_norm_var": 0.020148722330729167, + "learning_rate": 0.0001, + "loss": 8.4906, + "loss/crossentropy": 2.054360866546631, + "loss/hidden": 3.3046875, + "loss/jsd": 0.0, + "loss/logits": 0.31411731243133545, + "step": 2038 + }, + { + "epoch": 0.1275, + "grad_norm": 3.71875, + "grad_norm_var": 0.04202473958333333, + "learning_rate": 0.0001, + "loss": 8.3689, + "loss/crossentropy": 2.3898195028305054, + "loss/hidden": 3.3046875, + "loss/jsd": 0.0, + "loss/logits": 0.25973349064588547, + "step": 2040 + }, + { + "epoch": 0.127625, + "grad_norm": 3.171875, + "grad_norm_var": 0.04273173014322917, + "learning_rate": 0.0001, + "loss": 8.4614, + "loss/crossentropy": 2.2964980602264404, + "loss/hidden": 3.296875, + "loss/jsd": 0.0, + "loss/logits": 0.2612931877374649, + "step": 2042 + }, + { + "epoch": 0.12775, + "grad_norm": 3.1875, + "grad_norm_var": 0.0513092041015625, + "learning_rate": 0.0001, + "loss": 8.434, + "loss/crossentropy": 2.58343243598938, + "loss/hidden": 3.2421875, + "loss/jsd": 0.0, + "loss/logits": 0.26787108182907104, + "step": 2044 + }, + { + "epoch": 0.127875, + "grad_norm": 3.25, + "grad_norm_var": 0.046708170572916666, + "learning_rate": 0.0001, + "loss": 8.4918, + "loss/crossentropy": 2.24523389339447, + "loss/hidden": 3.2265625, + "loss/jsd": 0.0, + "loss/logits": 0.2874600887298584, + "step": 2046 + }, + { + "epoch": 0.128, + "grad_norm": 3.453125, + "grad_norm_var": 0.04885660807291667, + "learning_rate": 0.0001, + "loss": 8.6684, + "loss/crossentropy": 2.2914642095565796, + "loss/hidden": 3.296875, + "loss/jsd": 0.0, + "loss/logits": 0.3138655722141266, + "step": 2048 + }, + { + "epoch": 0.128125, + "grad_norm": 3.25, + "grad_norm_var": 0.0490386962890625, + "learning_rate": 0.0001, + "loss": 8.5455, + "loss/crossentropy": 2.5149351358413696, + "loss/hidden": 3.34375, + "loss/jsd": 0.0, + "loss/logits": 0.29720842838287354, + "step": 2050 + }, + { + "epoch": 0.12825, + "grad_norm": 3.25, + "grad_norm_var": 0.04035542805989583, + "learning_rate": 0.0001, + "loss": 8.3206, + "loss/crossentropy": 2.1453710794448853, + "loss/hidden": 3.3125, + "loss/jsd": 0.0, + "loss/logits": 0.23647580295801163, + "step": 2052 + }, + { + "epoch": 0.128375, + "grad_norm": 3.09375, + "grad_norm_var": 0.053831990559895834, + "learning_rate": 0.0001, + "loss": 8.2585, + "loss/crossentropy": 2.246112108230591, + "loss/hidden": 3.2578125, + "loss/jsd": 0.0, + "loss/logits": 0.26230376958847046, + "step": 2054 + }, + { + "epoch": 0.1285, + "grad_norm": 3.203125, + "grad_norm_var": 0.028238932291666668, + "learning_rate": 0.0001, + "loss": 8.3248, + "loss/crossentropy": 2.412488341331482, + "loss/hidden": 3.28125, + "loss/jsd": 0.0, + "loss/logits": 0.26848049461841583, + "step": 2056 + }, + { + "epoch": 0.128625, + "grad_norm": 3.09375, + "grad_norm_var": 0.027228800455729167, + "learning_rate": 0.0001, + "loss": 8.6028, + "loss/crossentropy": 2.4104580879211426, + "loss/hidden": 3.28125, + "loss/jsd": 0.0, + "loss/logits": 0.27841897308826447, + "step": 2058 + }, + { + "epoch": 0.12875, + "grad_norm": 3.109375, + "grad_norm_var": 0.025007120768229165, + "learning_rate": 0.0001, + "loss": 8.6475, + "loss/crossentropy": 2.443092107772827, + "loss/hidden": 3.2890625, + "loss/jsd": 0.0, + "loss/logits": 0.27504517138004303, + "step": 2060 + }, + { + "epoch": 0.128875, + "grad_norm": 3.234375, + "grad_norm_var": 0.025812784830729168, + "learning_rate": 0.0001, + "loss": 8.5933, + "loss/crossentropy": 2.401219129562378, + "loss/hidden": 3.265625, + "loss/jsd": 0.0, + "loss/logits": 0.29284021258354187, + "step": 2062 + }, + { + "epoch": 0.129, + "grad_norm": 3.125, + "grad_norm_var": 0.031208292643229166, + "learning_rate": 0.0001, + "loss": 8.4635, + "loss/crossentropy": 2.2347441911697388, + "loss/hidden": 3.3046875, + "loss/jsd": 0.0, + "loss/logits": 0.27816687524318695, + "step": 2064 + }, + { + "epoch": 0.129125, + "grad_norm": 3.375, + "grad_norm_var": 0.03675028483072917, + "learning_rate": 0.0001, + "loss": 8.4876, + "loss/crossentropy": 2.444359302520752, + "loss/hidden": 3.28125, + "loss/jsd": 0.0, + "loss/logits": 0.28021493554115295, + "step": 2066 + }, + { + "epoch": 0.12925, + "grad_norm": 3.25, + "grad_norm_var": 0.03655192057291667, + "learning_rate": 0.0001, + "loss": 8.4261, + "loss/crossentropy": 2.32525098323822, + "loss/hidden": 3.3125, + "loss/jsd": 0.0, + "loss/logits": 0.2960277646780014, + "step": 2068 + }, + { + "epoch": 0.129375, + "grad_norm": 3.296875, + "grad_norm_var": 0.023323567708333333, + "learning_rate": 0.0001, + "loss": 8.4241, + "loss/crossentropy": 2.489911675453186, + "loss/hidden": 3.296875, + "loss/jsd": 0.0, + "loss/logits": 0.2791932672262192, + "step": 2070 + }, + { + "epoch": 0.1295, + "grad_norm": 3.53125, + "grad_norm_var": 0.026981608072916666, + "learning_rate": 0.0001, + "loss": 8.5868, + "loss/crossentropy": 2.3564376831054688, + "loss/hidden": 3.296875, + "loss/jsd": 0.0, + "loss/logits": 0.2700600177049637, + "step": 2072 + }, + { + "epoch": 0.129625, + "grad_norm": 3.125, + "grad_norm_var": 0.026009114583333333, + "learning_rate": 0.0001, + "loss": 8.4257, + "loss/crossentropy": 2.5643441677093506, + "loss/hidden": 3.25, + "loss/jsd": 0.0, + "loss/logits": 0.26559390127658844, + "step": 2074 + }, + { + "epoch": 0.12975, + "grad_norm": 3.375, + "grad_norm_var": 0.02750244140625, + "learning_rate": 0.0001, + "loss": 8.5631, + "loss/crossentropy": 2.390055775642395, + "loss/hidden": 3.3046875, + "loss/jsd": 0.0, + "loss/logits": 0.2720278650522232, + "step": 2076 + }, + { + "epoch": 0.129875, + "grad_norm": 3.09375, + "grad_norm_var": 0.026610310872395834, + "learning_rate": 0.0001, + "loss": 8.1939, + "loss/crossentropy": 2.241236686706543, + "loss/hidden": 3.296875, + "loss/jsd": 0.0, + "loss/logits": 0.2665587216615677, + "step": 2078 + }, + { + "epoch": 0.13, + "grad_norm": 3.09375, + "grad_norm_var": 0.021516927083333335, + "learning_rate": 0.0001, + "loss": 8.3988, + "loss/crossentropy": 2.282191276550293, + "loss/hidden": 3.296875, + "loss/jsd": 0.0, + "loss/logits": 0.26526105403900146, + "step": 2080 + }, + { + "epoch": 0.130125, + "grad_norm": 3.484375, + "grad_norm_var": 0.021773274739583334, + "learning_rate": 0.0001, + "loss": 8.5927, + "loss/crossentropy": 2.325153946876526, + "loss/hidden": 3.28125, + "loss/jsd": 0.0, + "loss/logits": 0.2865811884403229, + "step": 2082 + }, + { + "epoch": 0.13025, + "grad_norm": 3.265625, + "grad_norm_var": 0.021870930989583332, + "learning_rate": 0.0001, + "loss": 8.3854, + "loss/crossentropy": 2.3805923461914062, + "loss/hidden": 3.28125, + "loss/jsd": 0.0, + "loss/logits": 0.26753927767276764, + "step": 2084 + }, + { + "epoch": 0.130375, + "grad_norm": 3.453125, + "grad_norm_var": 0.030301920572916665, + "learning_rate": 0.0001, + "loss": 8.4407, + "loss/crossentropy": 2.314916491508484, + "loss/hidden": 3.2578125, + "loss/jsd": 0.0, + "loss/logits": 0.27673472464084625, + "step": 2086 + }, + { + "epoch": 0.1305, + "grad_norm": 3.234375, + "grad_norm_var": 0.025699869791666666, + "learning_rate": 0.0001, + "loss": 8.4033, + "loss/crossentropy": 2.416364073753357, + "loss/hidden": 3.34375, + "loss/jsd": 0.0, + "loss/logits": 0.27838364243507385, + "step": 2088 + }, + { + "epoch": 0.130625, + "grad_norm": 3.421875, + "grad_norm_var": 0.027074178059895832, + "learning_rate": 0.0001, + "loss": 8.3806, + "loss/crossentropy": 2.4156532287597656, + "loss/hidden": 3.296875, + "loss/jsd": 0.0, + "loss/logits": 0.2752366364002228, + "step": 2090 + }, + { + "epoch": 0.13075, + "grad_norm": 3.375, + "grad_norm_var": 0.025288899739583332, + "learning_rate": 0.0001, + "loss": 8.6506, + "loss/crossentropy": 2.5105860233306885, + "loss/hidden": 3.28125, + "loss/jsd": 0.0, + "loss/logits": 0.28529515862464905, + "step": 2092 + }, + { + "epoch": 0.130875, + "grad_norm": 3.296875, + "grad_norm_var": 0.020458984375, + "learning_rate": 0.0001, + "loss": 8.613, + "loss/crossentropy": 2.4152865409851074, + "loss/hidden": 3.3203125, + "loss/jsd": 0.0, + "loss/logits": 0.304058313369751, + "step": 2094 + }, + { + "epoch": 0.131, + "grad_norm": 3.28125, + "grad_norm_var": 0.0170318603515625, + "learning_rate": 0.0001, + "loss": 8.5973, + "loss/crossentropy": 2.3700071573257446, + "loss/hidden": 3.2265625, + "loss/jsd": 0.0, + "loss/logits": 0.2684682756662369, + "step": 2096 + }, + { + "epoch": 0.131125, + "grad_norm": 3.171875, + "grad_norm_var": 0.015120442708333333, + "learning_rate": 0.0001, + "loss": 8.4983, + "loss/crossentropy": 2.305688500404358, + "loss/hidden": 3.2578125, + "loss/jsd": 0.0, + "loss/logits": 0.28055354952812195, + "step": 2098 + }, + { + "epoch": 0.13125, + "grad_norm": 3.09375, + "grad_norm_var": 0.018830362955729166, + "learning_rate": 0.0001, + "loss": 8.3541, + "loss/crossentropy": 2.0869110226631165, + "loss/hidden": 3.2734375, + "loss/jsd": 0.0, + "loss/logits": 0.2802015244960785, + "step": 2100 + }, + { + "epoch": 0.131375, + "grad_norm": 3.109375, + "grad_norm_var": 0.015999348958333333, + "learning_rate": 0.0001, + "loss": 8.2526, + "loss/crossentropy": 2.242175340652466, + "loss/hidden": 3.28125, + "loss/jsd": 0.0, + "loss/logits": 0.2823330760002136, + "step": 2102 + }, + { + "epoch": 0.1315, + "grad_norm": 3.53125, + "grad_norm_var": 0.020384724934895834, + "learning_rate": 0.0001, + "loss": 8.2892, + "loss/crossentropy": 2.3229693174362183, + "loss/hidden": 3.3359375, + "loss/jsd": 0.0, + "loss/logits": 0.2636559307575226, + "step": 2104 + }, + { + "epoch": 0.131625, + "grad_norm": 3.34375, + "grad_norm_var": 0.019831339518229168, + "learning_rate": 0.0001, + "loss": 8.4317, + "loss/crossentropy": 2.5321102142333984, + "loss/hidden": 3.3515625, + "loss/jsd": 0.0, + "loss/logits": 0.2901579737663269, + "step": 2106 + }, + { + "epoch": 0.13175, + "grad_norm": 3.265625, + "grad_norm_var": 0.01666259765625, + "learning_rate": 0.0001, + "loss": 8.4445, + "loss/crossentropy": 2.397943615913391, + "loss/hidden": 3.21875, + "loss/jsd": 0.0, + "loss/logits": 0.2635400742292404, + "step": 2108 + }, + { + "epoch": 0.131875, + "grad_norm": 3.546875, + "grad_norm_var": 0.021654256184895835, + "learning_rate": 0.0001, + "loss": 8.4828, + "loss/crossentropy": 2.489462733268738, + "loss/hidden": 3.28125, + "loss/jsd": 0.0, + "loss/logits": 0.2792319655418396, + "step": 2110 + }, + { + "epoch": 0.132, + "grad_norm": 3.375, + "grad_norm_var": 0.022289021809895834, + "learning_rate": 0.0001, + "loss": 8.5521, + "loss/crossentropy": 2.3110828399658203, + "loss/hidden": 3.296875, + "loss/jsd": 0.0, + "loss/logits": 0.30214357376098633, + "step": 2112 + }, + { + "epoch": 0.132125, + "grad_norm": 3.375, + "grad_norm_var": 0.022037760416666666, + "learning_rate": 0.0001, + "loss": 8.5841, + "loss/crossentropy": 2.563341736793518, + "loss/hidden": 3.3515625, + "loss/jsd": 0.0, + "loss/logits": 0.2941661477088928, + "step": 2114 + }, + { + "epoch": 0.13225, + "grad_norm": 3.078125, + "grad_norm_var": 0.022459920247395834, + "learning_rate": 0.0001, + "loss": 8.3364, + "loss/crossentropy": 2.4198527336120605, + "loss/hidden": 3.296875, + "loss/jsd": 0.0, + "loss/logits": 0.2666160762310028, + "step": 2116 + }, + { + "epoch": 0.132375, + "grad_norm": 3.09375, + "grad_norm_var": 0.020563761393229168, + "learning_rate": 0.0001, + "loss": 8.2778, + "loss/crossentropy": 2.285884737968445, + "loss/hidden": 3.2421875, + "loss/jsd": 0.0, + "loss/logits": 0.27266763150691986, + "step": 2118 + }, + { + "epoch": 0.1325, + "grad_norm": 3.25, + "grad_norm_var": 0.016988118489583332, + "learning_rate": 0.0001, + "loss": 8.5911, + "loss/crossentropy": 2.3701746463775635, + "loss/hidden": 3.3359375, + "loss/jsd": 0.0, + "loss/logits": 0.29898887872695923, + "step": 2120 + }, + { + "epoch": 0.132625, + "grad_norm": 3.296875, + "grad_norm_var": 0.015062459309895833, + "learning_rate": 0.0001, + "loss": 8.658, + "loss/crossentropy": 2.4369832277297974, + "loss/hidden": 3.28125, + "loss/jsd": 0.0, + "loss/logits": 0.27174198627471924, + "step": 2122 + }, + { + "epoch": 0.13275, + "grad_norm": 3.578125, + "grad_norm_var": 0.022785441080729166, + "learning_rate": 0.0001, + "loss": 8.5709, + "loss/crossentropy": 2.3035892248153687, + "loss/hidden": 3.265625, + "loss/jsd": 0.0, + "loss/logits": 0.2923412248492241, + "step": 2124 + }, + { + "epoch": 0.132875, + "grad_norm": 3.0625, + "grad_norm_var": 0.02310791015625, + "learning_rate": 0.0001, + "loss": 8.1847, + "loss/crossentropy": 2.384789824485779, + "loss/hidden": 3.2265625, + "loss/jsd": 0.0, + "loss/logits": 0.27344001829624176, + "step": 2126 + }, + { + "epoch": 0.133, + "grad_norm": 3.0625, + "grad_norm_var": 0.024242146809895834, + "learning_rate": 0.0001, + "loss": 8.306, + "loss/crossentropy": 2.2693413496017456, + "loss/hidden": 3.2421875, + "loss/jsd": 0.0, + "loss/logits": 0.2933850884437561, + "step": 2128 + }, + { + "epoch": 0.133125, + "grad_norm": 3.265625, + "grad_norm_var": 0.023193359375, + "learning_rate": 0.0001, + "loss": 8.5839, + "loss/crossentropy": 2.287759780883789, + "loss/hidden": 3.2734375, + "loss/jsd": 0.0, + "loss/logits": 0.2801215946674347, + "step": 2130 + }, + { + "epoch": 0.13325, + "grad_norm": 3.34375, + "grad_norm_var": 0.021825154622395832, + "learning_rate": 0.0001, + "loss": 8.7328, + "loss/crossentropy": 2.274720072746277, + "loss/hidden": 3.3359375, + "loss/jsd": 0.0, + "loss/logits": 0.31832873821258545, + "step": 2132 + }, + { + "epoch": 0.133375, + "grad_norm": 3.265625, + "grad_norm_var": 0.02047119140625, + "learning_rate": 0.0001, + "loss": 8.4415, + "loss/crossentropy": 2.5326437950134277, + "loss/hidden": 3.2890625, + "loss/jsd": 0.0, + "loss/logits": 0.2927217036485672, + "step": 2134 + }, + { + "epoch": 0.1335, + "grad_norm": 3.546875, + "grad_norm_var": 0.025422159830729166, + "learning_rate": 0.0001, + "loss": 8.4062, + "loss/crossentropy": 2.4775502681732178, + "loss/hidden": 3.328125, + "loss/jsd": 0.0, + "loss/logits": 0.28668080270290375, + "step": 2136 + }, + { + "epoch": 0.133625, + "grad_norm": 2.96875, + "grad_norm_var": 0.03284505208333333, + "learning_rate": 0.0001, + "loss": 8.4527, + "loss/crossentropy": 2.2728978395462036, + "loss/hidden": 3.21875, + "loss/jsd": 0.0, + "loss/logits": 0.2754566967487335, + "step": 2138 + }, + { + "epoch": 0.13375, + "grad_norm": 3.140625, + "grad_norm_var": 0.024095662434895835, + "learning_rate": 0.0001, + "loss": 8.6044, + "loss/crossentropy": 2.3857744932174683, + "loss/hidden": 3.28125, + "loss/jsd": 0.0, + "loss/logits": 0.2717433273792267, + "step": 2140 + }, + { + "epoch": 0.133875, + "grad_norm": 3.3125, + "grad_norm_var": 0.022175089518229166, + "learning_rate": 0.0001, + "loss": 8.5458, + "loss/crossentropy": 2.1756142377853394, + "loss/hidden": 3.2578125, + "loss/jsd": 0.0, + "loss/logits": 0.27554982900619507, + "step": 2142 + }, + { + "epoch": 0.134, + "grad_norm": 3.078125, + "grad_norm_var": 0.021971638997395834, + "learning_rate": 0.0001, + "loss": 8.3312, + "loss/crossentropy": 2.3074915409088135, + "loss/hidden": 3.2421875, + "loss/jsd": 0.0, + "loss/logits": 0.2622702121734619, + "step": 2144 + }, + { + "epoch": 0.134125, + "grad_norm": 3.03125, + "grad_norm_var": 0.0297271728515625, + "learning_rate": 0.0001, + "loss": 8.2721, + "loss/crossentropy": 2.320749521255493, + "loss/hidden": 3.234375, + "loss/jsd": 0.0, + "loss/logits": 0.28162428736686707, + "step": 2146 + }, + { + "epoch": 0.13425, + "grad_norm": 3.046875, + "grad_norm_var": 0.031061808268229168, + "learning_rate": 0.0001, + "loss": 8.3269, + "loss/crossentropy": 2.009217321872711, + "loss/hidden": 3.234375, + "loss/jsd": 0.0, + "loss/logits": 0.2450430542230606, + "step": 2148 + }, + { + "epoch": 0.134375, + "grad_norm": 3.3125, + "grad_norm_var": 0.031037394205729166, + "learning_rate": 0.0001, + "loss": 8.4999, + "loss/crossentropy": 2.278393268585205, + "loss/hidden": 3.2578125, + "loss/jsd": 0.0, + "loss/logits": 0.2750803083181381, + "step": 2150 + }, + { + "epoch": 0.1345, + "grad_norm": 3.265625, + "grad_norm_var": 0.02255859375, + "learning_rate": 0.0001, + "loss": 8.4054, + "loss/crossentropy": 2.2876728773117065, + "loss/hidden": 3.28125, + "loss/jsd": 0.0, + "loss/logits": 0.281391441822052, + "step": 2152 + }, + { + "epoch": 0.134625, + "grad_norm": 3.296875, + "grad_norm_var": 0.016682942708333332, + "learning_rate": 0.0001, + "loss": 8.2786, + "loss/crossentropy": 2.163739323616028, + "loss/hidden": 3.234375, + "loss/jsd": 0.0, + "loss/logits": 0.2533603757619858, + "step": 2154 + }, + { + "epoch": 0.13475, + "grad_norm": 3.0625, + "grad_norm_var": 0.0157867431640625, + "learning_rate": 0.0001, + "loss": 8.3023, + "loss/crossentropy": 2.3555957078933716, + "loss/hidden": 3.3515625, + "loss/jsd": 0.0, + "loss/logits": 0.3196130394935608, + "step": 2156 + }, + { + "epoch": 0.134875, + "grad_norm": 3.140625, + "grad_norm_var": 0.0141754150390625, + "learning_rate": 0.0001, + "loss": 8.4222, + "loss/crossentropy": 2.389632821083069, + "loss/hidden": 3.3359375, + "loss/jsd": 0.0, + "loss/logits": 0.2907683253288269, + "step": 2158 + }, + { + "epoch": 0.135, + "grad_norm": 2.9375, + "grad_norm_var": 0.015576171875, + "learning_rate": 0.0001, + "loss": 8.2051, + "loss/crossentropy": 2.140980839729309, + "loss/hidden": 3.265625, + "loss/jsd": 0.0, + "loss/logits": 0.274806946516037, + "step": 2160 + }, + { + "epoch": 0.135125, + "grad_norm": 3.375, + "grad_norm_var": 0.016227213541666667, + "learning_rate": 0.0001, + "loss": 8.4341, + "loss/crossentropy": 2.298740863800049, + "loss/hidden": 3.4453125, + "loss/jsd": 0.0, + "loss/logits": 0.30332574248313904, + "step": 2162 + }, + { + "epoch": 0.13525, + "grad_norm": 3.390625, + "grad_norm_var": 0.0182281494140625, + "learning_rate": 0.0001, + "loss": 8.267, + "loss/crossentropy": 2.3586976528167725, + "loss/hidden": 3.2734375, + "loss/jsd": 0.0, + "loss/logits": 0.2925044894218445, + "step": 2164 + }, + { + "epoch": 0.135375, + "grad_norm": 3.09375, + "grad_norm_var": 0.014676920572916667, + "learning_rate": 0.0001, + "loss": 8.4944, + "loss/crossentropy": 2.4937496185302734, + "loss/hidden": 3.3125, + "loss/jsd": 0.0, + "loss/logits": 0.28084906935691833, + "step": 2166 + }, + { + "epoch": 0.1355, + "grad_norm": 3.21875, + "grad_norm_var": 0.014778645833333333, + "learning_rate": 0.0001, + "loss": 8.3792, + "loss/crossentropy": 2.2324042320251465, + "loss/hidden": 3.28125, + "loss/jsd": 0.0, + "loss/logits": 0.26973237097263336, + "step": 2168 + }, + { + "epoch": 0.135625, + "grad_norm": 3.359375, + "grad_norm_var": 0.01695556640625, + "learning_rate": 0.0001, + "loss": 8.3303, + "loss/crossentropy": 2.3384079933166504, + "loss/hidden": 3.3671875, + "loss/jsd": 0.0, + "loss/logits": 0.2514626085758209, + "step": 2170 + }, + { + "epoch": 0.13575, + "grad_norm": 3.109375, + "grad_norm_var": 0.01636962890625, + "learning_rate": 0.0001, + "loss": 8.2997, + "loss/crossentropy": 2.2900805473327637, + "loss/hidden": 3.265625, + "loss/jsd": 0.0, + "loss/logits": 0.287667915225029, + "step": 2172 + }, + { + "epoch": 0.135875, + "grad_norm": 4.4375, + "grad_norm_var": 0.113818359375, + "learning_rate": 0.0001, + "loss": 8.6838, + "loss/crossentropy": 2.3386855125427246, + "loss/hidden": 3.3515625, + "loss/jsd": 0.0, + "loss/logits": 0.28304457664489746, + "step": 2174 + }, + { + "epoch": 0.136, + "grad_norm": 3.328125, + "grad_norm_var": 0.10152587890625, + "learning_rate": 0.0001, + "loss": 8.6361, + "loss/crossentropy": 2.4590978622436523, + "loss/hidden": 3.2890625, + "loss/jsd": 0.0, + "loss/logits": 0.3032967746257782, + "step": 2176 + }, + { + "epoch": 0.136125, + "grad_norm": 3.640625, + "grad_norm_var": 0.10586649576822917, + "learning_rate": 0.0001, + "loss": 8.524, + "loss/crossentropy": 2.4377013444900513, + "loss/hidden": 3.3046875, + "loss/jsd": 0.0, + "loss/logits": 0.2948048859834671, + "step": 2178 + }, + { + "epoch": 0.13625, + "grad_norm": 3.5, + "grad_norm_var": 0.11148173014322917, + "learning_rate": 0.0001, + "loss": 8.427, + "loss/crossentropy": 2.3763121366500854, + "loss/hidden": 3.28125, + "loss/jsd": 0.0, + "loss/logits": 0.2737642973661423, + "step": 2180 + }, + { + "epoch": 0.136375, + "grad_norm": 3.46875, + "grad_norm_var": 0.10756734212239584, + "learning_rate": 0.0001, + "loss": 8.7061, + "loss/crossentropy": 2.3559194803237915, + "loss/hidden": 3.34375, + "loss/jsd": 0.0, + "loss/logits": 0.32232511043548584, + "step": 2182 + }, + { + "epoch": 0.1365, + "grad_norm": 3.21875, + "grad_norm_var": 0.10926106770833334, + "learning_rate": 0.0001, + "loss": 8.3743, + "loss/crossentropy": 2.3787566423416138, + "loss/hidden": 3.2890625, + "loss/jsd": 0.0, + "loss/logits": 0.266545370221138, + "step": 2184 + }, + { + "epoch": 0.136625, + "grad_norm": 3.03125, + "grad_norm_var": 0.1169830322265625, + "learning_rate": 0.0001, + "loss": 8.5617, + "loss/crossentropy": 2.4981011152267456, + "loss/hidden": 3.296875, + "loss/jsd": 0.0, + "loss/logits": 0.30071987211704254, + "step": 2186 + }, + { + "epoch": 0.13675, + "grad_norm": 3.71875, + "grad_norm_var": 0.91328125, + "learning_rate": 0.0001, + "loss": 8.6439, + "loss/crossentropy": 2.4121392965316772, + "loss/hidden": 3.28125, + "loss/jsd": 0.0, + "loss/logits": 0.28945305943489075, + "step": 2188 + }, + { + "epoch": 0.136875, + "grad_norm": 3.34375, + "grad_norm_var": 0.8569498697916667, + "learning_rate": 0.0001, + "loss": 8.6653, + "loss/crossentropy": 2.5263431072235107, + "loss/hidden": 3.3671875, + "loss/jsd": 0.0, + "loss/logits": 0.3138289451599121, + "step": 2190 + }, + { + "epoch": 0.137, + "grad_norm": 3.421875, + "grad_norm_var": 0.8448404947916667, + "learning_rate": 0.0001, + "loss": 8.6425, + "loss/crossentropy": 2.3461450338363647, + "loss/hidden": 3.3203125, + "loss/jsd": 0.0, + "loss/logits": 0.2797774076461792, + "step": 2192 + }, + { + "epoch": 0.137125, + "grad_norm": 3.125, + "grad_norm_var": 0.8635813395182291, + "learning_rate": 0.0001, + "loss": 8.7482, + "loss/crossentropy": 2.562302827835083, + "loss/hidden": 3.4140625, + "loss/jsd": 0.0, + "loss/logits": 0.2883787453174591, + "step": 2194 + }, + { + "epoch": 0.13725, + "grad_norm": 3.671875, + "grad_norm_var": 0.8723917643229167, + "learning_rate": 0.0001, + "loss": 8.5541, + "loss/crossentropy": 2.392000913619995, + "loss/hidden": 3.2734375, + "loss/jsd": 0.0, + "loss/logits": 0.28321924805641174, + "step": 2196 + }, + { + "epoch": 0.137375, + "grad_norm": 3.09375, + "grad_norm_var": 0.8815592447916667, + "learning_rate": 0.0001, + "loss": 8.2531, + "loss/crossentropy": 2.250584840774536, + "loss/hidden": 3.234375, + "loss/jsd": 0.0, + "loss/logits": 0.2665387690067291, + "step": 2198 + }, + { + "epoch": 0.1375, + "grad_norm": 3.484375, + "grad_norm_var": 0.8637003580729167, + "learning_rate": 0.0001, + "loss": 8.2037, + "loss/crossentropy": 2.2912397384643555, + "loss/hidden": 3.21875, + "loss/jsd": 0.0, + "loss/logits": 0.2549472153186798, + "step": 2200 + }, + { + "epoch": 0.137625, + "grad_norm": 3.40625, + "grad_norm_var": 0.8500803629557292, + "learning_rate": 0.0001, + "loss": 8.3596, + "loss/crossentropy": 2.470545172691345, + "loss/hidden": 3.296875, + "loss/jsd": 0.0, + "loss/logits": 0.29853616654872894, + "step": 2202 + }, + { + "epoch": 0.13775, + "grad_norm": 3.140625, + "grad_norm_var": 0.0280670166015625, + "learning_rate": 0.0001, + "loss": 8.4787, + "loss/crossentropy": 2.1322121024131775, + "loss/hidden": 3.2578125, + "loss/jsd": 0.0, + "loss/logits": 0.2622481659054756, + "step": 2204 + }, + { + "epoch": 0.137875, + "grad_norm": 3.28125, + "grad_norm_var": 0.026740519205729167, + "learning_rate": 0.0001, + "loss": 8.466, + "loss/crossentropy": 2.340916156768799, + "loss/hidden": 3.2421875, + "loss/jsd": 0.0, + "loss/logits": 0.2863175719976425, + "step": 2206 + }, + { + "epoch": 0.138, + "grad_norm": 3.125, + "grad_norm_var": 0.03297119140625, + "learning_rate": 0.0001, + "loss": 8.4187, + "loss/crossentropy": 2.397621750831604, + "loss/hidden": 3.2421875, + "loss/jsd": 0.0, + "loss/logits": 0.2743557095527649, + "step": 2208 + }, + { + "epoch": 0.138125, + "grad_norm": 3.0, + "grad_norm_var": 0.03684794108072917, + "learning_rate": 0.0001, + "loss": 8.3557, + "loss/crossentropy": 2.2214730978012085, + "loss/hidden": 3.3359375, + "loss/jsd": 0.0, + "loss/logits": 0.3140524923801422, + "step": 2210 + }, + { + "epoch": 0.13825, + "grad_norm": 3.515625, + "grad_norm_var": 0.029759724934895832, + "learning_rate": 0.0001, + "loss": 8.4706, + "loss/crossentropy": 2.3356775045394897, + "loss/hidden": 3.2890625, + "loss/jsd": 0.0, + "loss/logits": 0.28506164252758026, + "step": 2212 + }, + { + "epoch": 0.138375, + "grad_norm": 3.09375, + "grad_norm_var": 0.030061848958333335, + "learning_rate": 0.0001, + "loss": 8.5412, + "loss/crossentropy": 2.2626901865005493, + "loss/hidden": 3.25, + "loss/jsd": 0.0, + "loss/logits": 0.2693404108285904, + "step": 2214 + }, + { + "epoch": 0.1385, + "grad_norm": 3.171875, + "grad_norm_var": 0.0215728759765625, + "learning_rate": 0.0001, + "loss": 8.3191, + "loss/crossentropy": 2.1286741495132446, + "loss/hidden": 3.2578125, + "loss/jsd": 0.0, + "loss/logits": 0.2530980408191681, + "step": 2216 + }, + { + "epoch": 0.138625, + "grad_norm": 3.21875, + "grad_norm_var": 0.021565755208333332, + "learning_rate": 0.0001, + "loss": 8.6178, + "loss/crossentropy": 2.3509132862091064, + "loss/hidden": 3.21875, + "loss/jsd": 0.0, + "loss/logits": 0.2691505402326584, + "step": 2218 + }, + { + "epoch": 0.13875, + "grad_norm": 3.46875, + "grad_norm_var": 0.0286529541015625, + "learning_rate": 0.0001, + "loss": 8.6084, + "loss/crossentropy": 2.3187073469161987, + "loss/hidden": 3.3203125, + "loss/jsd": 0.0, + "loss/logits": 0.2853757441043854, + "step": 2220 + }, + { + "epoch": 0.138875, + "grad_norm": 3.171875, + "grad_norm_var": 0.030882771809895834, + "learning_rate": 0.0001, + "loss": 8.5621, + "loss/crossentropy": 2.4514535665512085, + "loss/hidden": 3.296875, + "loss/jsd": 0.0, + "loss/logits": 0.2782391607761383, + "step": 2222 + }, + { + "epoch": 0.139, + "grad_norm": 3.25, + "grad_norm_var": 0.0228912353515625, + "learning_rate": 0.0001, + "loss": 8.3712, + "loss/crossentropy": 2.509943962097168, + "loss/hidden": 3.2265625, + "loss/jsd": 0.0, + "loss/logits": 0.2838071286678314, + "step": 2224 + }, + { + "epoch": 0.139125, + "grad_norm": 3.234375, + "grad_norm_var": 0.01640625, + "learning_rate": 0.0001, + "loss": 8.5852, + "loss/crossentropy": 2.2804245948791504, + "loss/hidden": 3.3984375, + "loss/jsd": 0.0, + "loss/logits": 0.28991344571113586, + "step": 2226 + }, + { + "epoch": 0.13925, + "grad_norm": 3.796875, + "grad_norm_var": 0.0291412353515625, + "learning_rate": 0.0001, + "loss": 8.7108, + "loss/crossentropy": 2.453079104423523, + "loss/hidden": 3.265625, + "loss/jsd": 0.0, + "loss/logits": 0.2956879884004593, + "step": 2228 + }, + { + "epoch": 0.139375, + "grad_norm": 3.296875, + "grad_norm_var": 0.02652587890625, + "learning_rate": 0.0001, + "loss": 8.6285, + "loss/crossentropy": 2.4368172883987427, + "loss/hidden": 3.3046875, + "loss/jsd": 0.0, + "loss/logits": 0.3047195374965668, + "step": 2230 + }, + { + "epoch": 0.1395, + "grad_norm": 3.140625, + "grad_norm_var": 0.0262359619140625, + "learning_rate": 0.0001, + "loss": 8.4539, + "loss/crossentropy": 2.306997299194336, + "loss/hidden": 3.296875, + "loss/jsd": 0.0, + "loss/logits": 0.251560240983963, + "step": 2232 + }, + { + "epoch": 0.139625, + "grad_norm": 3.03125, + "grad_norm_var": 0.042041015625, + "learning_rate": 0.0001, + "loss": 8.424, + "loss/crossentropy": 2.139360785484314, + "loss/hidden": 3.2421875, + "loss/jsd": 0.0, + "loss/logits": 0.28243619203567505, + "step": 2234 + }, + { + "epoch": 0.13975, + "grad_norm": 3.171875, + "grad_norm_var": 0.03759358723958333, + "learning_rate": 0.0001, + "loss": 8.2889, + "loss/crossentropy": 2.2428722381591797, + "loss/hidden": 3.2109375, + "loss/jsd": 0.0, + "loss/logits": 0.2649083435535431, + "step": 2236 + }, + { + "epoch": 0.139875, + "grad_norm": 3.171875, + "grad_norm_var": 0.03997395833333333, + "learning_rate": 0.0001, + "loss": 8.3299, + "loss/crossentropy": 2.3508530855178833, + "loss/hidden": 3.2578125, + "loss/jsd": 0.0, + "loss/logits": 0.2734558805823326, + "step": 2238 + }, + { + "epoch": 0.14, + "grad_norm": 3.15625, + "grad_norm_var": 0.042267862955729166, + "learning_rate": 0.0001, + "loss": 8.1833, + "loss/crossentropy": 2.3940563201904297, + "loss/hidden": 3.203125, + "loss/jsd": 0.0, + "loss/logits": 0.26705045998096466, + "step": 2240 + }, + { + "epoch": 0.140125, + "grad_norm": 3.359375, + "grad_norm_var": 0.0432037353515625, + "learning_rate": 0.0001, + "loss": 8.4273, + "loss/crossentropy": 2.294648766517639, + "loss/hidden": 3.2734375, + "loss/jsd": 0.0, + "loss/logits": 0.2624429166316986, + "step": 2242 + }, + { + "epoch": 0.14025, + "grad_norm": 3.21875, + "grad_norm_var": 0.023824055989583332, + "learning_rate": 0.0001, + "loss": 8.4463, + "loss/crossentropy": 2.303459644317627, + "loss/hidden": 3.234375, + "loss/jsd": 0.0, + "loss/logits": 0.28779207170009613, + "step": 2244 + }, + { + "epoch": 0.140375, + "grad_norm": 3.109375, + "grad_norm_var": 0.022484334309895833, + "learning_rate": 0.0001, + "loss": 8.5258, + "loss/crossentropy": 2.452125668525696, + "loss/hidden": 3.34375, + "loss/jsd": 0.0, + "loss/logits": 0.281644269824028, + "step": 2246 + }, + { + "epoch": 0.1405, + "grad_norm": 3.46875, + "grad_norm_var": 0.028857421875, + "learning_rate": 0.0001, + "loss": 8.4929, + "loss/crossentropy": 2.3145229816436768, + "loss/hidden": 3.265625, + "loss/jsd": 0.0, + "loss/logits": 0.2791597843170166, + "step": 2248 + }, + { + "epoch": 0.140625, + "grad_norm": 3.40625, + "grad_norm_var": 0.10737202962239584, + "learning_rate": 0.0001, + "loss": 8.3374, + "loss/crossentropy": 2.219459652900696, + "loss/hidden": 3.3125, + "loss/jsd": 0.0, + "loss/logits": 0.2960502803325653, + "step": 2250 + }, + { + "epoch": 0.14075, + "grad_norm": 3.296875, + "grad_norm_var": 0.10914306640625, + "learning_rate": 0.0001, + "loss": 8.4506, + "loss/crossentropy": 2.0157440304756165, + "loss/hidden": 3.3359375, + "loss/jsd": 0.0, + "loss/logits": 0.2858085185289383, + "step": 2252 + }, + { + "epoch": 0.140875, + "grad_norm": 3.375, + "grad_norm_var": 0.10522359212239583, + "learning_rate": 0.0001, + "loss": 8.6443, + "loss/crossentropy": 2.3050343990325928, + "loss/hidden": 3.25, + "loss/jsd": 0.0, + "loss/logits": 0.2768677771091461, + "step": 2254 + }, + { + "epoch": 0.141, + "grad_norm": 2.875, + "grad_norm_var": 0.12316792805989583, + "learning_rate": 0.0001, + "loss": 8.1788, + "loss/crossentropy": 2.2533038854599, + "loss/hidden": 3.203125, + "loss/jsd": 0.0, + "loss/logits": 0.24813885986804962, + "step": 2256 + }, + { + "epoch": 0.141125, + "grad_norm": 3.3125, + "grad_norm_var": 0.12766927083333332, + "learning_rate": 0.0001, + "loss": 8.3708, + "loss/crossentropy": 2.4170485734939575, + "loss/hidden": 3.21875, + "loss/jsd": 0.0, + "loss/logits": 0.2693845331668854, + "step": 2258 + }, + { + "epoch": 0.14125, + "grad_norm": 3.0625, + "grad_norm_var": 0.13147684733072917, + "learning_rate": 0.0001, + "loss": 8.4648, + "loss/crossentropy": 2.294472575187683, + "loss/hidden": 3.25, + "loss/jsd": 0.0, + "loss/logits": 0.2734464854001999, + "step": 2260 + }, + { + "epoch": 0.141375, + "grad_norm": 3.203125, + "grad_norm_var": 0.12929585774739583, + "learning_rate": 0.0001, + "loss": 8.1808, + "loss/crossentropy": 2.1154235005378723, + "loss/hidden": 3.265625, + "loss/jsd": 0.0, + "loss/logits": 0.28365112841129303, + "step": 2262 + }, + { + "epoch": 0.1415, + "grad_norm": 3.046875, + "grad_norm_var": 0.1328125, + "learning_rate": 0.0001, + "loss": 8.1941, + "loss/crossentropy": 2.1333194971084595, + "loss/hidden": 3.2578125, + "loss/jsd": 0.0, + "loss/logits": 0.32095377147197723, + "step": 2264 + }, + { + "epoch": 0.141625, + "grad_norm": 3.375, + "grad_norm_var": 0.02467041015625, + "learning_rate": 0.0001, + "loss": 8.4459, + "loss/crossentropy": 2.3149880170822144, + "loss/hidden": 3.2421875, + "loss/jsd": 0.0, + "loss/logits": 0.2762584388256073, + "step": 2266 + }, + { + "epoch": 0.14175, + "grad_norm": 2.9375, + "grad_norm_var": 0.028120930989583334, + "learning_rate": 0.0001, + "loss": 8.422, + "loss/crossentropy": 2.6591659784317017, + "loss/hidden": 3.375, + "loss/jsd": 0.0, + "loss/logits": 0.29070281982421875, + "step": 2268 + }, + { + "epoch": 0.141875, + "grad_norm": 3.359375, + "grad_norm_var": 0.026102701822916668, + "learning_rate": 0.0001, + "loss": 8.2709, + "loss/crossentropy": 2.3531733751296997, + "loss/hidden": 3.1953125, + "loss/jsd": 0.0, + "loss/logits": 0.24519621580839157, + "step": 2270 + }, + { + "epoch": 0.142, + "grad_norm": 3.09375, + "grad_norm_var": 0.05454813639322917, + "learning_rate": 0.0001, + "loss": 8.2338, + "loss/crossentropy": 2.1661760807037354, + "loss/hidden": 3.25, + "loss/jsd": 0.0, + "loss/logits": 0.24747133255004883, + "step": 2272 + }, + { + "epoch": 0.142125, + "grad_norm": 3.1875, + "grad_norm_var": 0.0540191650390625, + "learning_rate": 0.0001, + "loss": 8.5048, + "loss/crossentropy": 2.2301371097564697, + "loss/hidden": 3.2421875, + "loss/jsd": 0.0, + "loss/logits": 0.269399493932724, + "step": 2274 + }, + { + "epoch": 0.14225, + "grad_norm": 3.640625, + "grad_norm_var": 0.06352437337239583, + "learning_rate": 0.0001, + "loss": 8.5953, + "loss/crossentropy": 2.65217924118042, + "loss/hidden": 3.2890625, + "loss/jsd": 0.0, + "loss/logits": 0.288607120513916, + "step": 2276 + }, + { + "epoch": 0.142375, + "grad_norm": 2.9375, + "grad_norm_var": 0.07203369140625, + "learning_rate": 0.0001, + "loss": 8.0698, + "loss/crossentropy": 2.4561513662338257, + "loss/hidden": 3.265625, + "loss/jsd": 0.0, + "loss/logits": 0.2938566952943802, + "step": 2278 + }, + { + "epoch": 0.1425, + "grad_norm": 3.421875, + "grad_norm_var": 0.0697174072265625, + "learning_rate": 0.0001, + "loss": 8.3514, + "loss/crossentropy": 2.517719268798828, + "loss/hidden": 3.2890625, + "loss/jsd": 0.0, + "loss/logits": 0.2574344277381897, + "step": 2280 + }, + { + "epoch": 0.142625, + "grad_norm": 3.078125, + "grad_norm_var": 0.07379150390625, + "learning_rate": 0.0001, + "loss": 8.404, + "loss/crossentropy": 2.507733106613159, + "loss/hidden": 3.34375, + "loss/jsd": 0.0, + "loss/logits": 0.28523363173007965, + "step": 2282 + }, + { + "epoch": 0.14275, + "grad_norm": 3.078125, + "grad_norm_var": 0.0678863525390625, + "learning_rate": 0.0001, + "loss": 8.3211, + "loss/crossentropy": 2.248465895652771, + "loss/hidden": 3.265625, + "loss/jsd": 0.0, + "loss/logits": 0.27300480008125305, + "step": 2284 + }, + { + "epoch": 0.142875, + "grad_norm": 3.40625, + "grad_norm_var": 0.06708577473958334, + "learning_rate": 0.0001, + "loss": 8.4819, + "loss/crossentropy": 2.175841212272644, + "loss/hidden": 3.421875, + "loss/jsd": 0.0, + "loss/logits": 0.2846294641494751, + "step": 2286 + }, + { + "epoch": 0.143, + "grad_norm": 3.328125, + "grad_norm_var": 0.043257649739583334, + "learning_rate": 0.0001, + "loss": 8.5189, + "loss/crossentropy": 2.53265118598938, + "loss/hidden": 3.1953125, + "loss/jsd": 0.0, + "loss/logits": 0.2701347917318344, + "step": 2288 + }, + { + "epoch": 0.143125, + "grad_norm": 3.15625, + "grad_norm_var": 0.0451812744140625, + "learning_rate": 0.0001, + "loss": 8.1393, + "loss/crossentropy": 2.2115447521209717, + "loss/hidden": 3.25, + "loss/jsd": 0.0, + "loss/logits": 0.26285167038440704, + "step": 2290 + }, + { + "epoch": 0.14325, + "grad_norm": 2.953125, + "grad_norm_var": 0.038895670572916666, + "learning_rate": 0.0001, + "loss": 8.4895, + "loss/crossentropy": 2.540266752243042, + "loss/hidden": 3.2890625, + "loss/jsd": 0.0, + "loss/logits": 0.29030686616897583, + "step": 2292 + }, + { + "epoch": 0.143375, + "grad_norm": 3.109375, + "grad_norm_var": 0.0341705322265625, + "learning_rate": 0.0001, + "loss": 8.4026, + "loss/crossentropy": 2.361076593399048, + "loss/hidden": 3.2578125, + "loss/jsd": 0.0, + "loss/logits": 0.26986297965049744, + "step": 2294 + }, + { + "epoch": 0.1435, + "grad_norm": 3.109375, + "grad_norm_var": 0.022297159830729166, + "learning_rate": 0.0001, + "loss": 8.4674, + "loss/crossentropy": 2.3896981477737427, + "loss/hidden": 3.265625, + "loss/jsd": 0.0, + "loss/logits": 0.2720335051417351, + "step": 2296 + }, + { + "epoch": 0.143625, + "grad_norm": 3.171875, + "grad_norm_var": 0.022001139322916665, + "learning_rate": 0.0001, + "loss": 8.1891, + "loss/crossentropy": 2.1911017894744873, + "loss/hidden": 3.2265625, + "loss/jsd": 0.0, + "loss/logits": 0.26920412480831146, + "step": 2298 + }, + { + "epoch": 0.14375, + "grad_norm": 3.1875, + "grad_norm_var": 0.026090494791666665, + "learning_rate": 0.0001, + "loss": 8.3786, + "loss/crossentropy": 2.282583713531494, + "loss/hidden": 3.28125, + "loss/jsd": 0.0, + "loss/logits": 0.27442607283592224, + "step": 2300 + }, + { + "epoch": 0.143875, + "grad_norm": 3.140625, + "grad_norm_var": 0.015534464518229167, + "learning_rate": 0.0001, + "loss": 8.3697, + "loss/crossentropy": 2.3890000581741333, + "loss/hidden": 3.3203125, + "loss/jsd": 0.0, + "loss/logits": 0.29275405406951904, + "step": 2302 + }, + { + "epoch": 0.144, + "grad_norm": 3.34375, + "grad_norm_var": 0.025178019205729166, + "learning_rate": 0.0001, + "loss": 8.6341, + "loss/crossentropy": 2.5382707118988037, + "loss/hidden": 3.2734375, + "loss/jsd": 0.0, + "loss/logits": 0.2767443507909775, + "step": 2304 + }, + { + "epoch": 0.144125, + "grad_norm": 3.0, + "grad_norm_var": 0.027099609375, + "learning_rate": 0.0001, + "loss": 8.427, + "loss/crossentropy": 2.423385739326477, + "loss/hidden": 3.265625, + "loss/jsd": 0.0, + "loss/logits": 0.2653146833181381, + "step": 2306 + }, + { + "epoch": 0.14425, + "grad_norm": 3.140625, + "grad_norm_var": 0.023274739583333332, + "learning_rate": 0.0001, + "loss": 8.3893, + "loss/crossentropy": 2.351103663444519, + "loss/hidden": 3.3828125, + "loss/jsd": 0.0, + "loss/logits": 0.2693053185939789, + "step": 2308 + }, + { + "epoch": 0.144375, + "grad_norm": 2.78125, + "grad_norm_var": 0.032957967122395834, + "learning_rate": 0.0001, + "loss": 8.1106, + "loss/crossentropy": 2.161897659301758, + "loss/hidden": 3.203125, + "loss/jsd": 0.0, + "loss/logits": 0.2585082948207855, + "step": 2310 + }, + { + "epoch": 0.1445, + "grad_norm": 3.328125, + "grad_norm_var": 0.034468587239583334, + "learning_rate": 0.0001, + "loss": 8.5501, + "loss/crossentropy": 2.5974907875061035, + "loss/hidden": 3.2421875, + "loss/jsd": 0.0, + "loss/logits": 0.2823385000228882, + "step": 2312 + }, + { + "epoch": 0.144625, + "grad_norm": 3.125, + "grad_norm_var": 0.031636555989583336, + "learning_rate": 0.0001, + "loss": 8.1398, + "loss/crossentropy": 2.2245877981185913, + "loss/hidden": 3.2265625, + "loss/jsd": 0.0, + "loss/logits": 0.2538044899702072, + "step": 2314 + }, + { + "epoch": 0.14475, + "grad_norm": 3.140625, + "grad_norm_var": 0.028251139322916667, + "learning_rate": 0.0001, + "loss": 8.334, + "loss/crossentropy": 2.378359794616699, + "loss/hidden": 3.25, + "loss/jsd": 0.0, + "loss/logits": 0.27289170026779175, + "step": 2316 + }, + { + "epoch": 0.144875, + "grad_norm": 3.140625, + "grad_norm_var": 0.031180826822916667, + "learning_rate": 0.0001, + "loss": 8.32, + "loss/crossentropy": 2.2858855724334717, + "loss/hidden": 3.234375, + "loss/jsd": 0.0, + "loss/logits": 0.2700234651565552, + "step": 2318 + }, + { + "epoch": 0.145, + "grad_norm": 2.96875, + "grad_norm_var": 0.016901652018229168, + "learning_rate": 0.0001, + "loss": 8.3103, + "loss/crossentropy": 2.4376027584075928, + "loss/hidden": 3.234375, + "loss/jsd": 0.0, + "loss/logits": 0.26375965774059296, + "step": 2320 + }, + { + "epoch": 0.145125, + "grad_norm": 3.265625, + "grad_norm_var": 0.019580078125, + "learning_rate": 0.0001, + "loss": 8.2817, + "loss/crossentropy": 2.491786479949951, + "loss/hidden": 3.2578125, + "loss/jsd": 0.0, + "loss/logits": 0.27208443731069565, + "step": 2322 + }, + { + "epoch": 0.14525, + "grad_norm": 3.34375, + "grad_norm_var": 0.022907511393229166, + "learning_rate": 0.0001, + "loss": 8.3947, + "loss/crossentropy": 2.1058656573295593, + "loss/hidden": 3.1640625, + "loss/jsd": 0.0, + "loss/logits": 0.2547650635242462, + "step": 2324 + }, + { + "epoch": 0.145375, + "grad_norm": 2.984375, + "grad_norm_var": 0.017171223958333332, + "learning_rate": 0.0001, + "loss": 8.2597, + "loss/crossentropy": 2.208884119987488, + "loss/hidden": 3.21875, + "loss/jsd": 0.0, + "loss/logits": 0.27819526195526123, + "step": 2326 + }, + { + "epoch": 0.1455, + "grad_norm": 3.765625, + "grad_norm_var": 0.04153645833333333, + "learning_rate": 0.0001, + "loss": 8.6312, + "loss/crossentropy": 2.467799663543701, + "loss/hidden": 3.2265625, + "loss/jsd": 0.0, + "loss/logits": 0.26558516919612885, + "step": 2328 + }, + { + "epoch": 0.145625, + "grad_norm": 3.109375, + "grad_norm_var": 0.04265034993489583, + "learning_rate": 0.0001, + "loss": 8.4207, + "loss/crossentropy": 2.418417453765869, + "loss/hidden": 3.2421875, + "loss/jsd": 0.0, + "loss/logits": 0.2655976563692093, + "step": 2330 + }, + { + "epoch": 0.14575, + "grad_norm": 3.203125, + "grad_norm_var": 0.040848795572916666, + "learning_rate": 0.0001, + "loss": 8.3613, + "loss/crossentropy": 2.321434497833252, + "loss/hidden": 3.2734375, + "loss/jsd": 0.0, + "loss/logits": 0.30776557326316833, + "step": 2332 + }, + { + "epoch": 0.145875, + "grad_norm": 3.25, + "grad_norm_var": 0.04389546712239583, + "learning_rate": 0.0001, + "loss": 8.5335, + "loss/crossentropy": 2.321682333946228, + "loss/hidden": 3.1875, + "loss/jsd": 0.0, + "loss/logits": 0.2719985991716385, + "step": 2334 + }, + { + "epoch": 0.146, + "grad_norm": 2.875, + "grad_norm_var": 0.04781901041666667, + "learning_rate": 0.0001, + "loss": 8.1955, + "loss/crossentropy": 2.179586887359619, + "loss/hidden": 3.3046875, + "loss/jsd": 0.0, + "loss/logits": 0.24687421321868896, + "step": 2336 + }, + { + "epoch": 0.146125, + "grad_norm": 3.125, + "grad_norm_var": 0.043863932291666664, + "learning_rate": 0.0001, + "loss": 8.3332, + "loss/crossentropy": 2.24389386177063, + "loss/hidden": 3.2265625, + "loss/jsd": 0.0, + "loss/logits": 0.29478612542152405, + "step": 2338 + }, + { + "epoch": 0.14625, + "grad_norm": 3.15625, + "grad_norm_var": 0.04262593587239583, + "learning_rate": 0.0001, + "loss": 8.5372, + "loss/crossentropy": 2.5267962217330933, + "loss/hidden": 3.2734375, + "loss/jsd": 0.0, + "loss/logits": 0.2840902507305145, + "step": 2340 + }, + { + "epoch": 0.146375, + "grad_norm": 3.078125, + "grad_norm_var": 0.041792805989583334, + "learning_rate": 0.0001, + "loss": 8.3559, + "loss/crossentropy": 2.2484867572784424, + "loss/hidden": 3.25, + "loss/jsd": 0.0, + "loss/logits": 0.26582688093185425, + "step": 2342 + }, + { + "epoch": 0.1465, + "grad_norm": 3.34375, + "grad_norm_var": 0.021675618489583333, + "learning_rate": 0.0001, + "loss": 8.3478, + "loss/crossentropy": 2.3685790300369263, + "loss/hidden": 3.2578125, + "loss/jsd": 0.0, + "loss/logits": 0.27287431061267853, + "step": 2344 + }, + { + "epoch": 0.146625, + "grad_norm": 3.140625, + "grad_norm_var": 0.0241363525390625, + "learning_rate": 0.0001, + "loss": 8.625, + "loss/crossentropy": 2.430737853050232, + "loss/hidden": 3.25, + "loss/jsd": 0.0, + "loss/logits": 0.2730434536933899, + "step": 2346 + }, + { + "epoch": 0.14675, + "grad_norm": 2.890625, + "grad_norm_var": 0.07626546223958333, + "learning_rate": 0.0001, + "loss": 8.3045, + "loss/crossentropy": 2.4417322874069214, + "loss/hidden": 3.265625, + "loss/jsd": 0.0, + "loss/logits": 0.26828059554100037, + "step": 2348 + }, + { + "epoch": 0.146875, + "grad_norm": 3.90625, + "grad_norm_var": 0.11562398274739584, + "learning_rate": 0.0001, + "loss": 8.4202, + "loss/crossentropy": 2.2339513301849365, + "loss/hidden": 3.234375, + "loss/jsd": 0.0, + "loss/logits": 0.2739114910364151, + "step": 2350 + }, + { + "epoch": 0.147, + "grad_norm": 3.21875, + "grad_norm_var": 0.10396219889322916, + "learning_rate": 0.0001, + "loss": 8.5676, + "loss/crossentropy": 2.539394974708557, + "loss/hidden": 3.28125, + "loss/jsd": 0.0, + "loss/logits": 0.2761157304048538, + "step": 2352 + }, + { + "epoch": 0.147125, + "grad_norm": 3.25, + "grad_norm_var": 0.10067952473958333, + "learning_rate": 0.0001, + "loss": 8.2937, + "loss/crossentropy": 2.450527548789978, + "loss/hidden": 3.2890625, + "loss/jsd": 0.0, + "loss/logits": 0.26511844992637634, + "step": 2354 + }, + { + "epoch": 0.14725, + "grad_norm": 3.0625, + "grad_norm_var": 0.10345052083333334, + "learning_rate": 0.0001, + "loss": 8.1971, + "loss/crossentropy": 2.2373549938201904, + "loss/hidden": 3.2109375, + "loss/jsd": 0.0, + "loss/logits": 0.26237648725509644, + "step": 2356 + }, + { + "epoch": 0.147375, + "grad_norm": 3.59375, + "grad_norm_var": 0.09674072265625, + "learning_rate": 0.0001, + "loss": 8.4327, + "loss/crossentropy": 2.2837640047073364, + "loss/hidden": 3.21875, + "loss/jsd": 0.0, + "loss/logits": 0.2683512270450592, + "step": 2358 + }, + { + "epoch": 0.1475, + "grad_norm": 3.09375, + "grad_norm_var": 0.10239156087239583, + "learning_rate": 0.0001, + "loss": 8.3287, + "loss/crossentropy": 2.238037943840027, + "loss/hidden": 3.203125, + "loss/jsd": 0.0, + "loss/logits": 0.2713322341442108, + "step": 2360 + }, + { + "epoch": 0.147625, + "grad_norm": 3.0625, + "grad_norm_var": 0.104736328125, + "learning_rate": 0.0001, + "loss": 8.4571, + "loss/crossentropy": 2.5709011554718018, + "loss/hidden": 3.25, + "loss/jsd": 0.0, + "loss/logits": 0.27457040548324585, + "step": 2362 + }, + { + "epoch": 0.14775, + "grad_norm": 3.1875, + "grad_norm_var": 0.0671783447265625, + "learning_rate": 0.0001, + "loss": 8.4263, + "loss/crossentropy": 2.3125263452529907, + "loss/hidden": 3.25, + "loss/jsd": 0.0, + "loss/logits": 0.28667566180229187, + "step": 2364 + }, + { + "epoch": 0.147875, + "grad_norm": 3.328125, + "grad_norm_var": 0.025983683268229165, + "learning_rate": 0.0001, + "loss": 8.2756, + "loss/crossentropy": 2.230543076992035, + "loss/hidden": 3.3125, + "loss/jsd": 0.0, + "loss/logits": 0.2760595977306366, + "step": 2366 + }, + { + "epoch": 0.148, + "grad_norm": 3.078125, + "grad_norm_var": 0.021598307291666667, + "learning_rate": 0.0001, + "loss": 8.3417, + "loss/crossentropy": 2.6067885160446167, + "loss/hidden": 3.296875, + "loss/jsd": 0.0, + "loss/logits": 0.2718808054924011, + "step": 2368 + }, + { + "epoch": 0.148125, + "grad_norm": 3.5, + "grad_norm_var": 0.0322418212890625, + "learning_rate": 0.0001, + "loss": 8.4566, + "loss/crossentropy": 2.4617605209350586, + "loss/hidden": 3.234375, + "loss/jsd": 0.0, + "loss/logits": 0.2566886991262436, + "step": 2370 + }, + { + "epoch": 0.14825, + "grad_norm": 2.9375, + "grad_norm_var": 0.03632405598958333, + "learning_rate": 0.0001, + "loss": 8.3095, + "loss/crossentropy": 2.545408248901367, + "loss/hidden": 3.2421875, + "loss/jsd": 0.0, + "loss/logits": 0.28086233139038086, + "step": 2372 + }, + { + "epoch": 0.148375, + "grad_norm": 3.125, + "grad_norm_var": 0.025862630208333334, + "learning_rate": 0.0001, + "loss": 8.4444, + "loss/crossentropy": 2.310088276863098, + "loss/hidden": 3.21875, + "loss/jsd": 0.0, + "loss/logits": 0.28762035071849823, + "step": 2374 + }, + { + "epoch": 0.1485, + "grad_norm": 3.234375, + "grad_norm_var": 0.0256500244140625, + "learning_rate": 0.0001, + "loss": 8.4027, + "loss/crossentropy": 2.306009292602539, + "loss/hidden": 3.25, + "loss/jsd": 0.0, + "loss/logits": 0.251963771879673, + "step": 2376 + }, + { + "epoch": 0.148625, + "grad_norm": 3.21875, + "grad_norm_var": 0.023368326822916667, + "learning_rate": 0.0001, + "loss": 8.2834, + "loss/crossentropy": 2.531379818916321, + "loss/hidden": 3.3203125, + "loss/jsd": 0.0, + "loss/logits": 0.2890370786190033, + "step": 2378 + }, + { + "epoch": 0.14875, + "grad_norm": 3.09375, + "grad_norm_var": 0.022652180989583333, + "learning_rate": 0.0001, + "loss": 8.5298, + "loss/crossentropy": 2.4868820905685425, + "loss/hidden": 3.21875, + "loss/jsd": 0.0, + "loss/logits": 0.28367380797863007, + "step": 2380 + }, + { + "epoch": 0.148875, + "grad_norm": 3.109375, + "grad_norm_var": 0.022391764322916667, + "learning_rate": 0.0001, + "loss": 8.2447, + "loss/crossentropy": 2.4238349199295044, + "loss/hidden": 3.2734375, + "loss/jsd": 0.0, + "loss/logits": 0.2638505697250366, + "step": 2382 + }, + { + "epoch": 0.149, + "grad_norm": 3.046875, + "grad_norm_var": 0.023078409830729167, + "learning_rate": 0.0001, + "loss": 8.1903, + "loss/crossentropy": 2.428895592689514, + "loss/hidden": 3.25, + "loss/jsd": 0.0, + "loss/logits": 0.2814117968082428, + "step": 2384 + }, + { + "epoch": 0.149125, + "grad_norm": 3.015625, + "grad_norm_var": 0.07431233723958333, + "learning_rate": 0.0001, + "loss": 8.3687, + "loss/crossentropy": 2.4161367416381836, + "loss/hidden": 3.375, + "loss/jsd": 0.0, + "loss/logits": 0.29132652282714844, + "step": 2386 + }, + { + "epoch": 0.14925, + "grad_norm": 3.0625, + "grad_norm_var": 0.06982014973958334, + "learning_rate": 0.0001, + "loss": 8.3171, + "loss/crossentropy": 2.121293306350708, + "loss/hidden": 3.15625, + "loss/jsd": 0.0, + "loss/logits": 0.2542402744293213, + "step": 2388 + }, + { + "epoch": 0.149375, + "grad_norm": 3.421875, + "grad_norm_var": 0.0720123291015625, + "learning_rate": 0.0001, + "loss": 8.0846, + "loss/crossentropy": 2.355462431907654, + "loss/hidden": 3.234375, + "loss/jsd": 0.0, + "loss/logits": 0.262673944234848, + "step": 2390 + }, + { + "epoch": 0.1495, + "grad_norm": 4.15625, + "grad_norm_var": 0.12470703125, + "learning_rate": 0.0001, + "loss": 8.5856, + "loss/crossentropy": 2.4837170839309692, + "loss/hidden": 3.2265625, + "loss/jsd": 0.0, + "loss/logits": 0.27958710491657257, + "step": 2392 + }, + { + "epoch": 0.149625, + "grad_norm": 6.90625, + "grad_norm_var": 0.9296295166015625, + "learning_rate": 0.0001, + "loss": 8.3578, + "loss/crossentropy": 2.365368127822876, + "loss/hidden": 3.25, + "loss/jsd": 0.0, + "loss/logits": 0.2629378139972687, + "step": 2394 + }, + { + "epoch": 0.14975, + "grad_norm": 3.4375, + "grad_norm_var": 0.9130360921223958, + "learning_rate": 0.0001, + "loss": 8.2264, + "loss/crossentropy": 2.2927812337875366, + "loss/hidden": 3.2578125, + "loss/jsd": 0.0, + "loss/logits": 0.2774803042411804, + "step": 2396 + }, + { + "epoch": 0.149875, + "grad_norm": 3.21875, + "grad_norm_var": 0.9198527018229167, + "learning_rate": 0.0001, + "loss": 8.3089, + "loss/crossentropy": 2.3293185234069824, + "loss/hidden": 3.3203125, + "loss/jsd": 0.0, + "loss/logits": 0.25967323780059814, + "step": 2398 + }, + { + "epoch": 0.15, + "grad_norm": 3.5625, + "grad_norm_var": 0.9080891927083333, + "learning_rate": 0.0001, + "loss": 8.4132, + "loss/crossentropy": 2.153883457183838, + "loss/hidden": 3.28125, + "loss/jsd": 0.0, + "loss/logits": 0.26730459928512573, + "step": 2400 + }, + { + "epoch": 0.150125, + "grad_norm": 3.734375, + "grad_norm_var": 0.8680084228515625, + "learning_rate": 0.0001, + "loss": 8.216, + "loss/crossentropy": 2.3419724702835083, + "loss/hidden": 3.2734375, + "loss/jsd": 0.0, + "loss/logits": 0.2801203727722168, + "step": 2402 + }, + { + "epoch": 0.15025, + "grad_norm": 3.46875, + "grad_norm_var": 0.8511301676432291, + "learning_rate": 0.0001, + "loss": 8.5621, + "loss/crossentropy": 2.2398444414138794, + "loss/hidden": 3.1953125, + "loss/jsd": 0.0, + "loss/logits": 0.2656910568475723, + "step": 2404 + }, + { + "epoch": 0.150375, + "grad_norm": 3.1875, + "grad_norm_var": 0.8526926676432292, + "learning_rate": 0.0001, + "loss": 8.5041, + "loss/crossentropy": 2.604948043823242, + "loss/hidden": 3.2578125, + "loss/jsd": 0.0, + "loss/logits": 0.26598772406578064, + "step": 2406 + }, + { + "epoch": 0.1505, + "grad_norm": 3.296875, + "grad_norm_var": 0.8330393473307292, + "learning_rate": 0.0001, + "loss": 8.5201, + "loss/crossentropy": 2.3795058727264404, + "loss/hidden": 3.3125, + "loss/jsd": 0.0, + "loss/logits": 0.2822035402059555, + "step": 2408 + }, + { + "epoch": 0.150625, + "grad_norm": 3.03125, + "grad_norm_var": 0.05833231608072917, + "learning_rate": 0.0001, + "loss": 8.2627, + "loss/crossentropy": 2.440226197242737, + "loss/hidden": 3.21875, + "loss/jsd": 0.0, + "loss/logits": 0.2705456465482712, + "step": 2410 + }, + { + "epoch": 0.15075, + "grad_norm": 3.265625, + "grad_norm_var": 0.05461324055989583, + "learning_rate": 0.0001, + "loss": 8.4428, + "loss/crossentropy": 2.473353385925293, + "loss/hidden": 3.2578125, + "loss/jsd": 0.0, + "loss/logits": 0.27890461683273315, + "step": 2412 + }, + { + "epoch": 0.150875, + "grad_norm": 3.078125, + "grad_norm_var": 0.05740559895833333, + "learning_rate": 0.0001, + "loss": 8.049, + "loss/crossentropy": 2.4567281007766724, + "loss/hidden": 3.2578125, + "loss/jsd": 0.0, + "loss/logits": 0.28310708701610565, + "step": 2414 + }, + { + "epoch": 0.151, + "grad_norm": 3.28125, + "grad_norm_var": 0.05642903645833333, + "learning_rate": 0.0001, + "loss": 8.706, + "loss/crossentropy": 2.2676972150802612, + "loss/hidden": 3.28125, + "loss/jsd": 0.0, + "loss/logits": 0.2879829406738281, + "step": 2416 + }, + { + "epoch": 0.151125, + "grad_norm": 3.46875, + "grad_norm_var": 0.045084635416666664, + "learning_rate": 0.0001, + "loss": 8.6477, + "loss/crossentropy": 2.4988842010498047, + "loss/hidden": 3.390625, + "loss/jsd": 0.0, + "loss/logits": 0.2907903641462326, + "step": 2418 + }, + { + "epoch": 0.15125, + "grad_norm": 3.109375, + "grad_norm_var": 0.0454498291015625, + "learning_rate": 0.0001, + "loss": 8.3035, + "loss/crossentropy": 2.390307307243347, + "loss/hidden": 3.265625, + "loss/jsd": 0.0, + "loss/logits": 0.2815091013908386, + "step": 2420 + }, + { + "epoch": 0.151375, + "grad_norm": 2.984375, + "grad_norm_var": 0.050080362955729166, + "learning_rate": 0.0001, + "loss": 8.1178, + "loss/crossentropy": 2.188577175140381, + "loss/hidden": 3.2265625, + "loss/jsd": 0.0, + "loss/logits": 0.2619321793317795, + "step": 2422 + }, + { + "epoch": 0.1515, + "grad_norm": 3.609375, + "grad_norm_var": 0.04468994140625, + "learning_rate": 0.0001, + "loss": 8.7316, + "loss/crossentropy": 2.1936429142951965, + "loss/hidden": 3.2421875, + "loss/jsd": 0.0, + "loss/logits": 0.2852545231580734, + "step": 2424 + }, + { + "epoch": 0.151625, + "grad_norm": 3.15625, + "grad_norm_var": 0.04058837890625, + "learning_rate": 0.0001, + "loss": 8.3339, + "loss/crossentropy": 2.1987831592559814, + "loss/hidden": 3.1875, + "loss/jsd": 0.0, + "loss/logits": 0.2727145105600357, + "step": 2426 + }, + { + "epoch": 0.15175, + "grad_norm": 3.171875, + "grad_norm_var": 0.043505859375, + "learning_rate": 0.0001, + "loss": 8.2834, + "loss/crossentropy": 2.6830883026123047, + "loss/hidden": 3.34375, + "loss/jsd": 0.0, + "loss/logits": 0.2955752909183502, + "step": 2428 + }, + { + "epoch": 0.151875, + "grad_norm": 3.125, + "grad_norm_var": 0.0397857666015625, + "learning_rate": 0.0001, + "loss": 8.2492, + "loss/crossentropy": 2.3668758869171143, + "loss/hidden": 3.28125, + "loss/jsd": 0.0, + "loss/logits": 0.2798495292663574, + "step": 2430 + }, + { + "epoch": 0.152, + "grad_norm": 3.015625, + "grad_norm_var": 0.032502237955729166, + "learning_rate": 0.0001, + "loss": 8.219, + "loss/crossentropy": 2.3704047203063965, + "loss/hidden": 3.34375, + "loss/jsd": 0.0, + "loss/logits": 0.259890541434288, + "step": 2432 + }, + { + "epoch": 0.152125, + "grad_norm": 3.296875, + "grad_norm_var": 0.047337849934895836, + "learning_rate": 0.0001, + "loss": 8.4497, + "loss/crossentropy": 2.1782950162887573, + "loss/hidden": 3.375, + "loss/jsd": 0.0, + "loss/logits": 0.2903287261724472, + "step": 2434 + }, + { + "epoch": 0.15225, + "grad_norm": 3.09375, + "grad_norm_var": 0.04848531087239583, + "learning_rate": 0.0001, + "loss": 8.4486, + "loss/crossentropy": 2.4774245023727417, + "loss/hidden": 3.25, + "loss/jsd": 0.0, + "loss/logits": 0.27125996351242065, + "step": 2436 + }, + { + "epoch": 0.152375, + "grad_norm": 3.21875, + "grad_norm_var": 0.04543863932291667, + "learning_rate": 0.0001, + "loss": 8.3811, + "loss/crossentropy": 2.3557363748550415, + "loss/hidden": 3.3828125, + "loss/jsd": 0.0, + "loss/logits": 0.30695493519306183, + "step": 2438 + }, + { + "epoch": 0.1525, + "grad_norm": 3.234375, + "grad_norm_var": 0.034235636393229164, + "learning_rate": 0.0001, + "loss": 8.0553, + "loss/crossentropy": 2.362632989883423, + "loss/hidden": 3.3125, + "loss/jsd": 0.0, + "loss/logits": 0.26735785603523254, + "step": 2440 + }, + { + "epoch": 0.152625, + "grad_norm": 3.078125, + "grad_norm_var": 0.03411051432291667, + "learning_rate": 0.0001, + "loss": 8.179, + "loss/crossentropy": 2.4381628036499023, + "loss/hidden": 3.21875, + "loss/jsd": 0.0, + "loss/logits": 0.2577895522117615, + "step": 2442 + }, + { + "epoch": 0.15275, + "grad_norm": 3.046875, + "grad_norm_var": 0.0404449462890625, + "learning_rate": 0.0001, + "loss": 8.3991, + "loss/crossentropy": 2.350975751876831, + "loss/hidden": 3.203125, + "loss/jsd": 0.0, + "loss/logits": 0.27325308322906494, + "step": 2444 + }, + { + "epoch": 0.152875, + "grad_norm": 3.28125, + "grad_norm_var": 0.03941650390625, + "learning_rate": 0.0001, + "loss": 8.536, + "loss/crossentropy": 2.4157899618148804, + "loss/hidden": 3.28125, + "loss/jsd": 0.0, + "loss/logits": 0.27011483907699585, + "step": 2446 + }, + { + "epoch": 0.153, + "grad_norm": 4.5, + "grad_norm_var": 4.062442016601563, + "learning_rate": 0.0001, + "loss": 8.919, + "loss/crossentropy": 2.437688112258911, + "loss/hidden": 3.3671875, + "loss/jsd": 0.0, + "loss/logits": 0.31261374056339264, + "step": 2448 + }, + { + "epoch": 0.153125, + "grad_norm": 3.234375, + "grad_norm_var": 4.074149576822917, + "learning_rate": 0.0001, + "loss": 8.5219, + "loss/crossentropy": 2.4884743690490723, + "loss/hidden": 3.3515625, + "loss/jsd": 0.0, + "loss/logits": 0.281438983976841, + "step": 2450 + }, + { + "epoch": 0.15325, + "grad_norm": 3.0, + "grad_norm_var": 4.093941243489583, + "learning_rate": 0.0001, + "loss": 8.2878, + "loss/crossentropy": 2.3718087673187256, + "loss/hidden": 3.25, + "loss/jsd": 0.0, + "loss/logits": 0.2740315645933151, + "step": 2452 + }, + { + "epoch": 0.153375, + "grad_norm": 3.0625, + "grad_norm_var": 4.100650024414063, + "learning_rate": 0.0001, + "loss": 8.5518, + "loss/crossentropy": 2.2542319297790527, + "loss/hidden": 3.21875, + "loss/jsd": 0.0, + "loss/logits": 0.267962783575058, + "step": 2454 + }, + { + "epoch": 0.1535, + "grad_norm": 3.171875, + "grad_norm_var": 4.085700480143229, + "learning_rate": 0.0001, + "loss": 8.3585, + "loss/crossentropy": 2.4759573936462402, + "loss/hidden": 3.28125, + "loss/jsd": 0.0, + "loss/logits": 0.2723170071840286, + "step": 2456 + }, + { + "epoch": 0.153625, + "grad_norm": 3.1875, + "grad_norm_var": 4.062303670247396, + "learning_rate": 0.0001, + "loss": 8.17, + "loss/crossentropy": 2.260656952857971, + "loss/hidden": 3.1875, + "loss/jsd": 0.0, + "loss/logits": 0.25791803002357483, + "step": 2458 + }, + { + "epoch": 0.15375, + "grad_norm": 3.296875, + "grad_norm_var": 4.06168212890625, + "learning_rate": 0.0001, + "loss": 8.2435, + "loss/crossentropy": 2.2933273315429688, + "loss/hidden": 3.28125, + "loss/jsd": 0.0, + "loss/logits": 0.2662912607192993, + "step": 2460 + }, + { + "epoch": 0.153875, + "grad_norm": 3.125, + "grad_norm_var": 4.067943318684896, + "learning_rate": 0.0001, + "loss": 8.049, + "loss/crossentropy": 2.1535520553588867, + "loss/hidden": 3.171875, + "loss/jsd": 0.0, + "loss/logits": 0.2517680525779724, + "step": 2462 + }, + { + "epoch": 0.154, + "grad_norm": 3.0625, + "grad_norm_var": 0.0138580322265625, + "learning_rate": 0.0001, + "loss": 8.0467, + "loss/crossentropy": 2.0775814056396484, + "loss/hidden": 3.1796875, + "loss/jsd": 0.0, + "loss/logits": 0.25869153439998627, + "step": 2464 + }, + { + "epoch": 0.154125, + "grad_norm": 2.984375, + "grad_norm_var": 0.012906901041666667, + "learning_rate": 0.0001, + "loss": 8.1568, + "loss/crossentropy": 2.466973304748535, + "loss/hidden": 3.2421875, + "loss/jsd": 0.0, + "loss/logits": 0.27992746233940125, + "step": 2466 + }, + { + "epoch": 0.15425, + "grad_norm": 3.265625, + "grad_norm_var": 0.01207275390625, + "learning_rate": 0.0001, + "loss": 8.0886, + "loss/crossentropy": 2.3320833444595337, + "loss/hidden": 3.234375, + "loss/jsd": 0.0, + "loss/logits": 0.27216966450214386, + "step": 2468 + }, + { + "epoch": 0.154375, + "grad_norm": 3.0625, + "grad_norm_var": 0.0121490478515625, + "learning_rate": 0.0001, + "loss": 8.109, + "loss/crossentropy": 2.2403723001480103, + "loss/hidden": 3.203125, + "loss/jsd": 0.0, + "loss/logits": 0.24139465391635895, + "step": 2470 + }, + { + "epoch": 0.1545, + "grad_norm": 3.109375, + "grad_norm_var": 0.010472615559895834, + "learning_rate": 0.0001, + "loss": 8.56, + "loss/crossentropy": 2.1147449016571045, + "loss/hidden": 3.1796875, + "loss/jsd": 0.0, + "loss/logits": 0.2546851634979248, + "step": 2472 + }, + { + "epoch": 0.154625, + "grad_norm": 3.140625, + "grad_norm_var": 0.009496053059895834, + "learning_rate": 0.0001, + "loss": 8.2829, + "loss/crossentropy": 2.128211796283722, + "loss/hidden": 3.2890625, + "loss/jsd": 0.0, + "loss/logits": 0.2636387571692467, + "step": 2474 + }, + { + "epoch": 0.15475, + "grad_norm": 3.21875, + "grad_norm_var": 0.008250935872395834, + "learning_rate": 0.0001, + "loss": 8.4869, + "loss/crossentropy": 2.5061731338500977, + "loss/hidden": 3.2734375, + "loss/jsd": 0.0, + "loss/logits": 0.27180950343608856, + "step": 2476 + }, + { + "epoch": 0.154875, + "grad_norm": 3.296875, + "grad_norm_var": 0.009325154622395833, + "learning_rate": 0.0001, + "loss": 8.2537, + "loss/crossentropy": 1.90971839427948, + "loss/hidden": 3.234375, + "loss/jsd": 0.0, + "loss/logits": 0.2415306195616722, + "step": 2478 + }, + { + "epoch": 0.155, + "grad_norm": 2.859375, + "grad_norm_var": 0.01353759765625, + "learning_rate": 0.0001, + "loss": 7.9013, + "loss/crossentropy": 2.1773123145103455, + "loss/hidden": 3.21875, + "loss/jsd": 0.0, + "loss/logits": 0.2478671371936798, + "step": 2480 + }, + { + "epoch": 0.155125, + "grad_norm": 6.90625, + "grad_norm_var": 0.901953125, + "learning_rate": 0.0001, + "loss": 8.4798, + "loss/crossentropy": 2.3107967376708984, + "loss/hidden": 3.203125, + "loss/jsd": 0.0, + "loss/logits": 0.2525716871023178, + "step": 2482 + }, + { + "epoch": 0.15525, + "grad_norm": 3.3125, + "grad_norm_var": 0.8995513916015625, + "learning_rate": 0.0001, + "loss": 8.4744, + "loss/crossentropy": 2.247706890106201, + "loss/hidden": 3.3359375, + "loss/jsd": 0.0, + "loss/logits": 0.3075176626443863, + "step": 2484 + }, + { + "epoch": 0.155375, + "grad_norm": 3.203125, + "grad_norm_var": 0.89342041015625, + "learning_rate": 0.0001, + "loss": 8.1333, + "loss/crossentropy": 2.3638851642608643, + "loss/hidden": 3.2578125, + "loss/jsd": 0.0, + "loss/logits": 0.272746741771698, + "step": 2486 + }, + { + "epoch": 0.1555, + "grad_norm": 3.515625, + "grad_norm_var": 0.8865071614583333, + "learning_rate": 0.0001, + "loss": 8.2671, + "loss/crossentropy": 2.267830967903137, + "loss/hidden": 3.203125, + "loss/jsd": 0.0, + "loss/logits": 0.28789058327674866, + "step": 2488 + }, + { + "epoch": 0.155625, + "grad_norm": 3.140625, + "grad_norm_var": 0.8851552327473958, + "learning_rate": 0.0001, + "loss": 8.1937, + "loss/crossentropy": 2.165649652481079, + "loss/hidden": 3.2578125, + "loss/jsd": 0.0, + "loss/logits": 0.26410168409347534, + "step": 2490 + }, + { + "epoch": 0.15575, + "grad_norm": 3.109375, + "grad_norm_var": 0.8902496337890625, + "learning_rate": 0.0001, + "loss": 8.348, + "loss/crossentropy": 2.2928093671798706, + "loss/hidden": 3.234375, + "loss/jsd": 0.0, + "loss/logits": 0.2730616182088852, + "step": 2492 + }, + { + "epoch": 0.155875, + "grad_norm": 3.6875, + "grad_norm_var": 1.3131256103515625, + "learning_rate": 0.0001, + "loss": 9.0116, + "loss/crossentropy": 2.288671135902405, + "loss/hidden": 3.203125, + "loss/jsd": 0.0, + "loss/logits": 0.25715528428554535, + "step": 2494 + }, + { + "epoch": 0.156, + "grad_norm": 3.515625, + "grad_norm_var": 1.245361328125, + "learning_rate": 0.0001, + "loss": 8.2275, + "loss/crossentropy": 2.04764986038208, + "loss/hidden": 3.234375, + "loss/jsd": 0.0, + "loss/logits": 0.2550469785928726, + "step": 2496 + }, + { + "epoch": 0.156125, + "grad_norm": 3.265625, + "grad_norm_var": 0.5328196207682292, + "learning_rate": 0.0001, + "loss": 8.3368, + "loss/crossentropy": 2.1027456521987915, + "loss/hidden": 3.171875, + "loss/jsd": 0.0, + "loss/logits": 0.27562327682971954, + "step": 2498 + }, + { + "epoch": 0.15625, + "grad_norm": 3.375, + "grad_norm_var": 0.5341054280598958, + "learning_rate": 0.0001, + "loss": 8.3857, + "loss/crossentropy": 2.3927990198135376, + "loss/hidden": 3.3203125, + "loss/jsd": 0.0, + "loss/logits": 0.2799486666917801, + "step": 2500 + }, + { + "epoch": 0.156375, + "grad_norm": 3.421875, + "grad_norm_var": 0.5267567952473958, + "learning_rate": 0.0001, + "loss": 8.3894, + "loss/crossentropy": 2.379324197769165, + "loss/hidden": 3.25, + "loss/jsd": 0.0, + "loss/logits": 0.27068574726581573, + "step": 2502 + }, + { + "epoch": 0.1565, + "grad_norm": 3.0625, + "grad_norm_var": 0.5542805989583334, + "learning_rate": 0.0001, + "loss": 8.6141, + "loss/crossentropy": 2.1458650827407837, + "loss/hidden": 3.328125, + "loss/jsd": 0.0, + "loss/logits": 0.3379057049751282, + "step": 2504 + }, + { + "epoch": 0.156625, + "grad_norm": 3.140625, + "grad_norm_var": 0.5693359375, + "learning_rate": 0.0001, + "loss": 8.3573, + "loss/crossentropy": 2.2261852025985718, + "loss/hidden": 3.2265625, + "loss/jsd": 0.0, + "loss/logits": 0.27541545033454895, + "step": 2506 + }, + { + "epoch": 0.15675, + "grad_norm": 3.15625, + "grad_norm_var": 0.56923828125, + "learning_rate": 0.0001, + "loss": 8.2813, + "loss/crossentropy": 2.300438404083252, + "loss/hidden": 3.265625, + "loss/jsd": 0.0, + "loss/logits": 0.26637162268161774, + "step": 2508 + }, + { + "epoch": 0.156875, + "grad_norm": 3.203125, + "grad_norm_var": 0.05836181640625, + "learning_rate": 0.0001, + "loss": 8.1558, + "loss/crossentropy": 2.4900972843170166, + "loss/hidden": 3.203125, + "loss/jsd": 0.0, + "loss/logits": 0.26418106257915497, + "step": 2510 + }, + { + "epoch": 0.157, + "grad_norm": 3.15625, + "grad_norm_var": 0.0244781494140625, + "learning_rate": 0.0001, + "loss": 8.3795, + "loss/crossentropy": 2.510451316833496, + "loss/hidden": 3.2265625, + "loss/jsd": 0.0, + "loss/logits": 0.2828855812549591, + "step": 2512 + }, + { + "epoch": 0.157125, + "grad_norm": 3.0625, + "grad_norm_var": 0.023856608072916667, + "learning_rate": 0.0001, + "loss": 8.2548, + "loss/crossentropy": 2.3360713720321655, + "loss/hidden": 3.203125, + "loss/jsd": 0.0, + "loss/logits": 0.2564430236816406, + "step": 2514 + }, + { + "epoch": 0.15725, + "grad_norm": 3.265625, + "grad_norm_var": 0.018147786458333332, + "learning_rate": 0.0001, + "loss": 8.4346, + "loss/crossentropy": 2.3846248388290405, + "loss/hidden": 3.3828125, + "loss/jsd": 0.0, + "loss/logits": 0.28445227444171906, + "step": 2516 + }, + { + "epoch": 0.157375, + "grad_norm": 3.34375, + "grad_norm_var": 0.019938151041666668, + "learning_rate": 0.0001, + "loss": 8.2736, + "loss/crossentropy": 2.2677204608917236, + "loss/hidden": 3.328125, + "loss/jsd": 0.0, + "loss/logits": 0.2820749282836914, + "step": 2518 + }, + { + "epoch": 0.1575, + "grad_norm": 3.0, + "grad_norm_var": 0.023974609375, + "learning_rate": 0.0001, + "loss": 8.2038, + "loss/crossentropy": 2.469799518585205, + "loss/hidden": 3.1875, + "loss/jsd": 0.0, + "loss/logits": 0.25989628583192825, + "step": 2520 + }, + { + "epoch": 0.157625, + "grad_norm": 3.28125, + "grad_norm_var": 0.024702962239583334, + "learning_rate": 0.0001, + "loss": 8.3751, + "loss/crossentropy": 2.4882609844207764, + "loss/hidden": 3.28125, + "loss/jsd": 0.0, + "loss/logits": 0.2699503153562546, + "step": 2522 + }, + { + "epoch": 0.15775, + "grad_norm": 3.25, + "grad_norm_var": 0.026123046875, + "learning_rate": 0.0001, + "loss": 8.4694, + "loss/crossentropy": 2.5819085836410522, + "loss/hidden": 3.25, + "loss/jsd": 0.0, + "loss/logits": 0.27766086161136627, + "step": 2524 + }, + { + "epoch": 0.157875, + "grad_norm": 2.90625, + "grad_norm_var": 0.025072224934895835, + "learning_rate": 0.0001, + "loss": 8.1619, + "loss/crossentropy": 2.2840874791145325, + "loss/hidden": 3.296875, + "loss/jsd": 0.0, + "loss/logits": 0.29234474897384644, + "step": 2526 + }, + { + "epoch": 0.158, + "grad_norm": 3.234375, + "grad_norm_var": 0.026513671875, + "learning_rate": 0.0001, + "loss": 8.4086, + "loss/crossentropy": 2.4526052474975586, + "loss/hidden": 3.2421875, + "loss/jsd": 0.0, + "loss/logits": 0.28289155662059784, + "step": 2528 + }, + { + "epoch": 0.158125, + "grad_norm": 3.4375, + "grad_norm_var": 0.026513671875, + "learning_rate": 0.0001, + "loss": 8.7414, + "loss/crossentropy": 2.654939293861389, + "loss/hidden": 3.40625, + "loss/jsd": 0.0, + "loss/logits": 0.2989393472671509, + "step": 2530 + }, + { + "epoch": 0.15825, + "grad_norm": 2.953125, + "grad_norm_var": 0.030028279622395834, + "learning_rate": 0.0001, + "loss": 8.2029, + "loss/crossentropy": 2.375227451324463, + "loss/hidden": 3.25, + "loss/jsd": 0.0, + "loss/logits": 0.29081976413726807, + "step": 2532 + }, + { + "epoch": 0.158375, + "grad_norm": 2.953125, + "grad_norm_var": 0.024201456705729166, + "learning_rate": 0.0001, + "loss": 8.2599, + "loss/crossentropy": 2.2531535625457764, + "loss/hidden": 3.265625, + "loss/jsd": 0.0, + "loss/logits": 0.26515205204486847, + "step": 2534 + }, + { + "epoch": 0.1585, + "grad_norm": 3.203125, + "grad_norm_var": 0.0256744384765625, + "learning_rate": 0.0001, + "loss": 8.6216, + "loss/crossentropy": 2.3937805891036987, + "loss/hidden": 3.3515625, + "loss/jsd": 0.0, + "loss/logits": 0.29868148267269135, + "step": 2536 + }, + { + "epoch": 0.158625, + "grad_norm": 3.609375, + "grad_norm_var": 0.03931884765625, + "learning_rate": 0.0001, + "loss": 8.7359, + "loss/crossentropy": 2.475276470184326, + "loss/hidden": 3.34375, + "loss/jsd": 0.0, + "loss/logits": 0.28303323686122894, + "step": 2538 + }, + { + "epoch": 0.15875, + "grad_norm": 3.328125, + "grad_norm_var": 0.040526326497395834, + "learning_rate": 0.0001, + "loss": 8.1197, + "loss/crossentropy": 2.0302132964134216, + "loss/hidden": 3.21875, + "loss/jsd": 0.0, + "loss/logits": 0.2279423400759697, + "step": 2540 + }, + { + "epoch": 0.158875, + "grad_norm": 3.0625, + "grad_norm_var": 0.036774698893229166, + "learning_rate": 0.0001, + "loss": 8.4462, + "loss/crossentropy": 2.604537010192871, + "loss/hidden": 3.2890625, + "loss/jsd": 0.0, + "loss/logits": 0.2865126132965088, + "step": 2542 + }, + { + "epoch": 0.159, + "grad_norm": 3.109375, + "grad_norm_var": 0.03657124837239583, + "learning_rate": 0.0001, + "loss": 8.4149, + "loss/crossentropy": 2.3719130754470825, + "loss/hidden": 3.2109375, + "loss/jsd": 0.0, + "loss/logits": 0.2572627142071724, + "step": 2544 + }, + { + "epoch": 0.159125, + "grad_norm": 3.0625, + "grad_norm_var": 0.033772786458333336, + "learning_rate": 0.0001, + "loss": 8.226, + "loss/crossentropy": 2.290672540664673, + "loss/hidden": 3.3515625, + "loss/jsd": 0.0, + "loss/logits": 0.3219170421361923, + "step": 2546 + }, + { + "epoch": 0.15925, + "grad_norm": 3.046875, + "grad_norm_var": 0.031636555989583336, + "learning_rate": 0.0001, + "loss": 8.2854, + "loss/crossentropy": 2.145516276359558, + "loss/hidden": 3.1875, + "loss/jsd": 0.0, + "loss/logits": 0.2706049084663391, + "step": 2548 + }, + { + "epoch": 0.159375, + "grad_norm": 3.359375, + "grad_norm_var": 0.03004150390625, + "learning_rate": 0.0001, + "loss": 8.2928, + "loss/crossentropy": 2.5301170349121094, + "loss/hidden": 3.1484375, + "loss/jsd": 0.0, + "loss/logits": 0.2519679069519043, + "step": 2550 + }, + { + "epoch": 0.1595, + "grad_norm": 3.109375, + "grad_norm_var": 0.0247955322265625, + "learning_rate": 0.0001, + "loss": 8.3513, + "loss/crossentropy": 2.4736965894699097, + "loss/hidden": 3.328125, + "loss/jsd": 0.0, + "loss/logits": 0.27385158836841583, + "step": 2552 + }, + { + "epoch": 0.159625, + "grad_norm": 3.0625, + "grad_norm_var": 0.010282389322916667, + "learning_rate": 0.0001, + "loss": 8.4279, + "loss/crossentropy": 2.6575098037719727, + "loss/hidden": 3.328125, + "loss/jsd": 0.0, + "loss/logits": 0.28586356341838837, + "step": 2554 + }, + { + "epoch": 0.15975, + "grad_norm": 3.203125, + "grad_norm_var": 0.010188802083333334, + "learning_rate": 0.0001, + "loss": 8.3428, + "loss/crossentropy": 2.436043381690979, + "loss/hidden": 3.3046875, + "loss/jsd": 0.0, + "loss/logits": 0.2793993800878525, + "step": 2556 + }, + { + "epoch": 0.159875, + "grad_norm": 3.09375, + "grad_norm_var": 0.01025390625, + "learning_rate": 0.0001, + "loss": 8.2126, + "loss/crossentropy": 2.388734817504883, + "loss/hidden": 3.2421875, + "loss/jsd": 0.0, + "loss/logits": 0.24712733924388885, + "step": 2558 + }, + { + "epoch": 0.16, + "grad_norm": 3.3125, + "grad_norm_var": 0.012140909830729166, + "learning_rate": 0.0001, + "loss": 8.341, + "loss/crossentropy": 2.488566756248474, + "loss/hidden": 3.296875, + "loss/jsd": 0.0, + "loss/logits": 0.28663991391658783, + "step": 2560 + }, + { + "epoch": 0.160125, + "grad_norm": 3.0, + "grad_norm_var": 0.012626139322916667, + "learning_rate": 0.0001, + "loss": 8.1953, + "loss/crossentropy": 2.360735058784485, + "loss/hidden": 3.234375, + "loss/jsd": 0.0, + "loss/logits": 0.2733890935778618, + "step": 2562 + }, + { + "epoch": 0.16025, + "grad_norm": 3.125, + "grad_norm_var": 0.012262980143229166, + "learning_rate": 0.0001, + "loss": 8.2249, + "loss/crossentropy": 2.49215030670166, + "loss/hidden": 3.3046875, + "loss/jsd": 0.0, + "loss/logits": 0.29839469492435455, + "step": 2564 + }, + { + "epoch": 0.160375, + "grad_norm": 2.96875, + "grad_norm_var": 0.00963134765625, + "learning_rate": 0.0001, + "loss": 8.2714, + "loss/crossentropy": 2.4546769857406616, + "loss/hidden": 3.1796875, + "loss/jsd": 0.0, + "loss/logits": 0.2619743049144745, + "step": 2566 + }, + { + "epoch": 0.1605, + "grad_norm": 3.1875, + "grad_norm_var": 0.013232421875, + "learning_rate": 0.0001, + "loss": 8.3612, + "loss/crossentropy": 2.429980993270874, + "loss/hidden": 3.2265625, + "loss/jsd": 0.0, + "loss/logits": 0.2659847140312195, + "step": 2568 + }, + { + "epoch": 0.160625, + "grad_norm": 3.0625, + "grad_norm_var": 0.013704427083333333, + "learning_rate": 0.0001, + "loss": 8.1659, + "loss/crossentropy": 2.3688398003578186, + "loss/hidden": 3.1953125, + "loss/jsd": 0.0, + "loss/logits": 0.23781683295965195, + "step": 2570 + }, + { + "epoch": 0.16075, + "grad_norm": 3.046875, + "grad_norm_var": 0.026318359375, + "learning_rate": 0.0001, + "loss": 8.346, + "loss/crossentropy": 2.4594470262527466, + "loss/hidden": 3.2421875, + "loss/jsd": 0.0, + "loss/logits": 0.2740607261657715, + "step": 2572 + }, + { + "epoch": 0.160875, + "grad_norm": 3.265625, + "grad_norm_var": 0.027985636393229166, + "learning_rate": 0.0001, + "loss": 8.268, + "loss/crossentropy": 2.194278836250305, + "loss/hidden": 3.1875, + "loss/jsd": 0.0, + "loss/logits": 0.26395024359226227, + "step": 2574 + }, + { + "epoch": 0.161, + "grad_norm": 2.875, + "grad_norm_var": 0.029378255208333332, + "learning_rate": 0.0001, + "loss": 8.1956, + "loss/crossentropy": 2.1503721475601196, + "loss/hidden": 3.125, + "loss/jsd": 0.0, + "loss/logits": 0.24605603516101837, + "step": 2576 + }, + { + "epoch": 0.161125, + "grad_norm": 3.59375, + "grad_norm_var": 0.046971638997395836, + "learning_rate": 0.0001, + "loss": 8.2973, + "loss/crossentropy": 2.331193447113037, + "loss/hidden": 3.203125, + "loss/jsd": 0.0, + "loss/logits": 0.2738886624574661, + "step": 2578 + }, + { + "epoch": 0.16125, + "grad_norm": 3.09375, + "grad_norm_var": 0.046971638997395836, + "learning_rate": 0.0001, + "loss": 8.3282, + "loss/crossentropy": 2.3280651569366455, + "loss/hidden": 3.2265625, + "loss/jsd": 0.0, + "loss/logits": 0.280707448720932, + "step": 2580 + }, + { + "epoch": 0.161375, + "grad_norm": 3.1875, + "grad_norm_var": 0.04517822265625, + "learning_rate": 0.0001, + "loss": 8.2786, + "loss/crossentropy": 2.2947434186935425, + "loss/hidden": 3.2265625, + "loss/jsd": 0.0, + "loss/logits": 0.28385594487190247, + "step": 2582 + }, + { + "epoch": 0.1615, + "grad_norm": 3.09375, + "grad_norm_var": 0.03827718098958333, + "learning_rate": 0.0001, + "loss": 8.3341, + "loss/crossentropy": 2.4629331827163696, + "loss/hidden": 3.1875, + "loss/jsd": 0.0, + "loss/logits": 0.2666812241077423, + "step": 2584 + }, + { + "epoch": 0.161625, + "grad_norm": 3.25, + "grad_norm_var": 0.0423248291015625, + "learning_rate": 0.0001, + "loss": 8.6395, + "loss/crossentropy": 2.167258083820343, + "loss/hidden": 3.4296875, + "loss/jsd": 0.0, + "loss/logits": 0.31057223677635193, + "step": 2586 + }, + { + "epoch": 0.16175, + "grad_norm": 4.5, + "grad_norm_var": 0.1481109619140625, + "learning_rate": 0.0001, + "loss": 8.2884, + "loss/crossentropy": 2.116983652114868, + "loss/hidden": 3.296875, + "loss/jsd": 0.0, + "loss/logits": 0.2682839557528496, + "step": 2588 + }, + { + "epoch": 0.161875, + "grad_norm": 3.34375, + "grad_norm_var": 0.15274149576822918, + "learning_rate": 0.0001, + "loss": 8.7022, + "loss/crossentropy": 2.733883261680603, + "loss/hidden": 3.3359375, + "loss/jsd": 0.0, + "loss/logits": 0.3234509229660034, + "step": 2590 + }, + { + "epoch": 0.162, + "grad_norm": 3.0625, + "grad_norm_var": 0.14871317545572918, + "learning_rate": 0.0001, + "loss": 8.1477, + "loss/crossentropy": 2.146886646747589, + "loss/hidden": 3.2265625, + "loss/jsd": 0.0, + "loss/logits": 0.26147788763046265, + "step": 2592 + }, + { + "epoch": 0.162125, + "grad_norm": 2.921875, + "grad_norm_var": 0.14615885416666666, + "learning_rate": 0.0001, + "loss": 8.1554, + "loss/crossentropy": 2.311735153198242, + "loss/hidden": 3.203125, + "loss/jsd": 0.0, + "loss/logits": 0.26781047880649567, + "step": 2594 + }, + { + "epoch": 0.16225, + "grad_norm": 3.140625, + "grad_norm_var": 0.14546610514322916, + "learning_rate": 0.0001, + "loss": 8.177, + "loss/crossentropy": 2.4164129495620728, + "loss/hidden": 3.2265625, + "loss/jsd": 0.0, + "loss/logits": 0.26862579584121704, + "step": 2596 + }, + { + "epoch": 0.162375, + "grad_norm": 3.234375, + "grad_norm_var": 0.14485270182291668, + "learning_rate": 0.0001, + "loss": 8.2389, + "loss/crossentropy": 2.1019481420516968, + "loss/hidden": 3.140625, + "loss/jsd": 0.0, + "loss/logits": 0.26436343789100647, + "step": 2598 + }, + { + "epoch": 0.1625, + "grad_norm": 3.34375, + "grad_norm_var": 0.14814046223958333, + "learning_rate": 0.0001, + "loss": 8.481, + "loss/crossentropy": 2.262871265411377, + "loss/hidden": 3.2265625, + "loss/jsd": 0.0, + "loss/logits": 0.2700708210468292, + "step": 2600 + }, + { + "epoch": 0.162625, + "grad_norm": 3.234375, + "grad_norm_var": 0.14741923014322916, + "learning_rate": 0.0001, + "loss": 8.2838, + "loss/crossentropy": 2.635279059410095, + "loss/hidden": 3.2578125, + "loss/jsd": 0.0, + "loss/logits": 0.29194803535938263, + "step": 2602 + }, + { + "epoch": 0.16275, + "grad_norm": 2.984375, + "grad_norm_var": 0.030952962239583333, + "learning_rate": 0.0001, + "loss": 8.2658, + "loss/crossentropy": 2.4246195554733276, + "loss/hidden": 3.2734375, + "loss/jsd": 0.0, + "loss/logits": 0.2945238947868347, + "step": 2604 + }, + { + "epoch": 0.162875, + "grad_norm": 3.25, + "grad_norm_var": 0.0225738525390625, + "learning_rate": 0.0001, + "loss": 8.2605, + "loss/crossentropy": 2.0880810022354126, + "loss/hidden": 3.2421875, + "loss/jsd": 0.0, + "loss/logits": 0.263704776763916, + "step": 2606 + }, + { + "epoch": 0.163, + "grad_norm": 3.03125, + "grad_norm_var": 0.024706013997395835, + "learning_rate": 0.0001, + "loss": 8.1831, + "loss/crossentropy": 2.080967903137207, + "loss/hidden": 3.203125, + "loss/jsd": 0.0, + "loss/logits": 0.24847671389579773, + "step": 2608 + }, + { + "epoch": 0.163125, + "grad_norm": 3.125, + "grad_norm_var": 0.019270833333333334, + "learning_rate": 0.0001, + "loss": 8.416, + "loss/crossentropy": 2.363895058631897, + "loss/hidden": 3.1953125, + "loss/jsd": 0.0, + "loss/logits": 0.28405146300792694, + "step": 2610 + }, + { + "epoch": 0.16325, + "grad_norm": 3.0, + "grad_norm_var": 0.0218658447265625, + "learning_rate": 0.0001, + "loss": 8.3748, + "loss/crossentropy": 2.4413585662841797, + "loss/hidden": 3.2421875, + "loss/jsd": 0.0, + "loss/logits": 0.28752467036247253, + "step": 2612 + }, + { + "epoch": 0.163375, + "grad_norm": 3.109375, + "grad_norm_var": 0.022728474934895833, + "learning_rate": 0.0001, + "loss": 8.2726, + "loss/crossentropy": 2.3740620613098145, + "loss/hidden": 3.21875, + "loss/jsd": 0.0, + "loss/logits": 0.2649526298046112, + "step": 2614 + }, + { + "epoch": 0.1635, + "grad_norm": 3.203125, + "grad_norm_var": 0.027534993489583333, + "learning_rate": 0.0001, + "loss": 8.3047, + "loss/crossentropy": 2.2964380979537964, + "loss/hidden": 3.1953125, + "loss/jsd": 0.0, + "loss/logits": 0.2646304666996002, + "step": 2616 + }, + { + "epoch": 0.163625, + "grad_norm": 3.4375, + "grad_norm_var": 0.038736979166666664, + "learning_rate": 0.0001, + "loss": 8.2126, + "loss/crossentropy": 2.1447632908821106, + "loss/hidden": 3.15625, + "loss/jsd": 0.0, + "loss/logits": 0.24735675752162933, + "step": 2618 + }, + { + "epoch": 0.16375, + "grad_norm": 3.625, + "grad_norm_var": 0.0572418212890625, + "learning_rate": 0.0001, + "loss": 8.5174, + "loss/crossentropy": 2.433838725090027, + "loss/hidden": 3.2578125, + "loss/jsd": 0.0, + "loss/logits": 0.2876787632703781, + "step": 2620 + }, + { + "epoch": 0.163875, + "grad_norm": 3.03125, + "grad_norm_var": 0.059521484375, + "learning_rate": 0.0001, + "loss": 8.0701, + "loss/crossentropy": 2.204833507537842, + "loss/hidden": 3.234375, + "loss/jsd": 0.0, + "loss/logits": 0.2602345570921898, + "step": 2622 + }, + { + "epoch": 0.164, + "grad_norm": 3.375, + "grad_norm_var": 0.0607086181640625, + "learning_rate": 0.0001, + "loss": 8.3564, + "loss/crossentropy": 2.4403984546661377, + "loss/hidden": 3.265625, + "loss/jsd": 0.0, + "loss/logits": 0.28500233590602875, + "step": 2624 + }, + { + "epoch": 0.164125, + "grad_norm": 3.25, + "grad_norm_var": 0.06425374348958333, + "learning_rate": 0.0001, + "loss": 8.4073, + "loss/crossentropy": 2.461033344268799, + "loss/hidden": 3.1875, + "loss/jsd": 0.0, + "loss/logits": 0.2762896418571472, + "step": 2626 + }, + { + "epoch": 0.16425, + "grad_norm": 3.359375, + "grad_norm_var": 0.0610748291015625, + "learning_rate": 0.0001, + "loss": 8.2333, + "loss/crossentropy": 2.3412665128707886, + "loss/hidden": 3.171875, + "loss/jsd": 0.0, + "loss/logits": 0.2702695280313492, + "step": 2628 + }, + { + "epoch": 0.164375, + "grad_norm": 3.078125, + "grad_norm_var": 0.0517974853515625, + "learning_rate": 0.0001, + "loss": 8.471, + "loss/crossentropy": 2.395104169845581, + "loss/hidden": 3.328125, + "loss/jsd": 0.0, + "loss/logits": 0.266902893781662, + "step": 2630 + }, + { + "epoch": 0.1645, + "grad_norm": 3.0625, + "grad_norm_var": 0.0504058837890625, + "learning_rate": 0.0001, + "loss": 8.4257, + "loss/crossentropy": 2.5319817066192627, + "loss/hidden": 3.2421875, + "loss/jsd": 0.0, + "loss/logits": 0.28416794538497925, + "step": 2632 + }, + { + "epoch": 0.164625, + "grad_norm": 3.59375, + "grad_norm_var": 0.048291015625, + "learning_rate": 0.0001, + "loss": 8.0315, + "loss/crossentropy": 2.1557105779647827, + "loss/hidden": 3.1953125, + "loss/jsd": 0.0, + "loss/logits": 0.2669526934623718, + "step": 2634 + }, + { + "epoch": 0.16475, + "grad_norm": 2.96875, + "grad_norm_var": 0.04273681640625, + "learning_rate": 0.0001, + "loss": 8.3233, + "loss/crossentropy": 2.5935251712799072, + "loss/hidden": 3.2421875, + "loss/jsd": 0.0, + "loss/logits": 0.24305777996778488, + "step": 2636 + }, + { + "epoch": 0.164875, + "grad_norm": 3.1875, + "grad_norm_var": 0.033910115559895836, + "learning_rate": 0.0001, + "loss": 8.4456, + "loss/crossentropy": 2.489887237548828, + "loss/hidden": 3.2578125, + "loss/jsd": 0.0, + "loss/logits": 0.26338575780391693, + "step": 2638 + }, + { + "epoch": 0.165, + "grad_norm": 2.859375, + "grad_norm_var": 0.03730061848958333, + "learning_rate": 0.0001, + "loss": 8.0705, + "loss/crossentropy": 2.252521276473999, + "loss/hidden": 3.1953125, + "loss/jsd": 0.0, + "loss/logits": 0.24118152260780334, + "step": 2640 + }, + { + "epoch": 0.165125, + "grad_norm": 3.1875, + "grad_norm_var": 0.03505757649739583, + "learning_rate": 0.0001, + "loss": 8.007, + "loss/crossentropy": 2.2367311120033264, + "loss/hidden": 3.2109375, + "loss/jsd": 0.0, + "loss/logits": 0.25346408784389496, + "step": 2642 + }, + { + "epoch": 0.16525, + "grad_norm": 3.15625, + "grad_norm_var": 0.03387044270833333, + "learning_rate": 0.0001, + "loss": 8.2971, + "loss/crossentropy": 2.2261271476745605, + "loss/hidden": 3.3203125, + "loss/jsd": 0.0, + "loss/logits": 0.2599840611219406, + "step": 2644 + }, + { + "epoch": 0.165375, + "grad_norm": 3.078125, + "grad_norm_var": 0.0338775634765625, + "learning_rate": 0.0001, + "loss": 8.2174, + "loss/crossentropy": 2.3545104265213013, + "loss/hidden": 3.21875, + "loss/jsd": 0.0, + "loss/logits": 0.268455445766449, + "step": 2646 + }, + { + "epoch": 0.1655, + "grad_norm": 3.234375, + "grad_norm_var": 0.037816365559895836, + "learning_rate": 0.0001, + "loss": 8.1461, + "loss/crossentropy": 2.2744003534317017, + "loss/hidden": 3.171875, + "loss/jsd": 0.0, + "loss/logits": 0.23194655776023865, + "step": 2648 + }, + { + "epoch": 0.165625, + "grad_norm": 3.140625, + "grad_norm_var": 0.0201812744140625, + "learning_rate": 0.0001, + "loss": 8.4255, + "loss/crossentropy": 2.292569160461426, + "loss/hidden": 3.2109375, + "loss/jsd": 0.0, + "loss/logits": 0.25596070289611816, + "step": 2650 + }, + { + "epoch": 0.16575, + "grad_norm": 3.046875, + "grad_norm_var": 0.024983723958333332, + "learning_rate": 0.0001, + "loss": 8.1794, + "loss/crossentropy": 2.06977915763855, + "loss/hidden": 3.1796875, + "loss/jsd": 0.0, + "loss/logits": 0.22923216223716736, + "step": 2652 + }, + { + "epoch": 0.165875, + "grad_norm": 3.25, + "grad_norm_var": 0.025641886393229167, + "learning_rate": 0.0001, + "loss": 8.4059, + "loss/crossentropy": 2.262938976287842, + "loss/hidden": 3.25, + "loss/jsd": 0.0, + "loss/logits": 0.2663477659225464, + "step": 2654 + }, + { + "epoch": 0.166, + "grad_norm": 3.203125, + "grad_norm_var": 0.0209625244140625, + "learning_rate": 0.0001, + "loss": 8.337, + "loss/crossentropy": 1.9289529919624329, + "loss/hidden": 3.2578125, + "loss/jsd": 0.0, + "loss/logits": 0.2564796954393387, + "step": 2656 + }, + { + "epoch": 0.166125, + "grad_norm": 3.171875, + "grad_norm_var": 0.0193359375, + "learning_rate": 0.0001, + "loss": 8.2522, + "loss/crossentropy": 2.4027702808380127, + "loss/hidden": 3.25, + "loss/jsd": 0.0, + "loss/logits": 0.282346248626709, + "step": 2658 + }, + { + "epoch": 0.16625, + "grad_norm": 2.859375, + "grad_norm_var": 0.0200103759765625, + "learning_rate": 0.0001, + "loss": 8.1585, + "loss/crossentropy": 2.121252119541168, + "loss/hidden": 3.203125, + "loss/jsd": 0.0, + "loss/logits": 0.26361793279647827, + "step": 2660 + }, + { + "epoch": 0.166375, + "grad_norm": 3.234375, + "grad_norm_var": 0.02109375, + "learning_rate": 0.0001, + "loss": 8.1217, + "loss/crossentropy": 2.355001926422119, + "loss/hidden": 3.1953125, + "loss/jsd": 0.0, + "loss/logits": 0.25889521837234497, + "step": 2662 + }, + { + "epoch": 0.1665, + "grad_norm": 3.125, + "grad_norm_var": 0.01630859375, + "learning_rate": 0.0001, + "loss": 8.2613, + "loss/crossentropy": 2.3688048124313354, + "loss/hidden": 3.2578125, + "loss/jsd": 0.0, + "loss/logits": 0.27535010874271393, + "step": 2664 + }, + { + "epoch": 0.166625, + "grad_norm": 3.140625, + "grad_norm_var": 0.014225260416666666, + "learning_rate": 0.0001, + "loss": 8.0862, + "loss/crossentropy": 2.3740497827529907, + "loss/hidden": 3.1875, + "loss/jsd": 0.0, + "loss/logits": 0.2727824002504349, + "step": 2666 + }, + { + "epoch": 0.16675, + "grad_norm": 3.1875, + "grad_norm_var": 0.015738932291666667, + "learning_rate": 0.0001, + "loss": 8.3176, + "loss/crossentropy": 2.3888481855392456, + "loss/hidden": 3.15625, + "loss/jsd": 0.0, + "loss/logits": 0.2630738615989685, + "step": 2668 + }, + { + "epoch": 0.166875, + "grad_norm": 3.03125, + "grad_norm_var": 0.016942342122395832, + "learning_rate": 0.0001, + "loss": 8.156, + "loss/crossentropy": 2.0409966707229614, + "loss/hidden": 3.1796875, + "loss/jsd": 0.0, + "loss/logits": 0.2550422251224518, + "step": 2670 + }, + { + "epoch": 0.167, + "grad_norm": 3.171875, + "grad_norm_var": 0.017236328125, + "learning_rate": 0.0001, + "loss": 8.4412, + "loss/crossentropy": 2.5270535945892334, + "loss/hidden": 3.296875, + "loss/jsd": 0.0, + "loss/logits": 0.29448163509368896, + "step": 2672 + }, + { + "epoch": 0.167125, + "grad_norm": 2.96875, + "grad_norm_var": 0.028661092122395832, + "learning_rate": 0.0001, + "loss": 8.0915, + "loss/crossentropy": 2.235354781150818, + "loss/hidden": 3.28125, + "loss/jsd": 0.0, + "loss/logits": 0.26712487637996674, + "step": 2674 + }, + { + "epoch": 0.16725, + "grad_norm": 3.09375, + "grad_norm_var": 0.0249664306640625, + "learning_rate": 0.0001, + "loss": 7.9645, + "loss/crossentropy": 2.1596986055374146, + "loss/hidden": 3.203125, + "loss/jsd": 0.0, + "loss/logits": 0.2632744610309601, + "step": 2676 + }, + { + "epoch": 0.167375, + "grad_norm": 3.03125, + "grad_norm_var": 0.0266510009765625, + "learning_rate": 0.0001, + "loss": 8.1883, + "loss/crossentropy": 2.2907787561416626, + "loss/hidden": 3.28125, + "loss/jsd": 0.0, + "loss/logits": 0.2716197818517685, + "step": 2678 + }, + { + "epoch": 0.1675, + "grad_norm": 2.796875, + "grad_norm_var": 0.0345367431640625, + "learning_rate": 0.0001, + "loss": 8.1149, + "loss/crossentropy": 2.200819969177246, + "loss/hidden": 3.2265625, + "loss/jsd": 0.0, + "loss/logits": 0.28321780264377594, + "step": 2680 + }, + { + "epoch": 0.167625, + "grad_norm": 3.0625, + "grad_norm_var": 0.03452860514322917, + "learning_rate": 0.0001, + "loss": 8.3545, + "loss/crossentropy": 2.5655312538146973, + "loss/hidden": 3.1953125, + "loss/jsd": 0.0, + "loss/logits": 0.26827070116996765, + "step": 2682 + }, + { + "epoch": 0.16775, + "grad_norm": 2.875, + "grad_norm_var": 0.02847900390625, + "learning_rate": 0.0001, + "loss": 7.964, + "loss/crossentropy": 2.368655800819397, + "loss/hidden": 3.1796875, + "loss/jsd": 0.0, + "loss/logits": 0.25074201822280884, + "step": 2684 + }, + { + "epoch": 0.167875, + "grad_norm": 2.890625, + "grad_norm_var": 0.030647786458333333, + "learning_rate": 0.0001, + "loss": 8.2303, + "loss/crossentropy": 2.522045135498047, + "loss/hidden": 3.28125, + "loss/jsd": 0.0, + "loss/logits": 0.289681613445282, + "step": 2686 + }, + { + "epoch": 0.168, + "grad_norm": 3.109375, + "grad_norm_var": 0.0275299072265625, + "learning_rate": 0.0001, + "loss": 8.212, + "loss/crossentropy": 2.3031221628189087, + "loss/hidden": 3.2421875, + "loss/jsd": 0.0, + "loss/logits": 0.2869686484336853, + "step": 2688 + }, + { + "epoch": 0.168125, + "grad_norm": 3.1875, + "grad_norm_var": 0.014427693684895833, + "learning_rate": 0.0001, + "loss": 8.1903, + "loss/crossentropy": 2.2180505990982056, + "loss/hidden": 3.234375, + "loss/jsd": 0.0, + "loss/logits": 0.2845149040222168, + "step": 2690 + }, + { + "epoch": 0.16825, + "grad_norm": 3.078125, + "grad_norm_var": 0.014420572916666667, + "learning_rate": 0.0001, + "loss": 8.1318, + "loss/crossentropy": 2.1774216294288635, + "loss/hidden": 3.1796875, + "loss/jsd": 0.0, + "loss/logits": 0.27391664683818817, + "step": 2692 + }, + { + "epoch": 0.168375, + "grad_norm": 2.90625, + "grad_norm_var": 0.016747029622395833, + "learning_rate": 0.0001, + "loss": 8.1924, + "loss/crossentropy": 2.3473572731018066, + "loss/hidden": 3.2578125, + "loss/jsd": 0.0, + "loss/logits": 0.25838781893253326, + "step": 2694 + }, + { + "epoch": 0.1685, + "grad_norm": 3.546875, + "grad_norm_var": 0.028385416666666666, + "learning_rate": 0.0001, + "loss": 8.1097, + "loss/crossentropy": 2.3881657123565674, + "loss/hidden": 3.203125, + "loss/jsd": 0.0, + "loss/logits": 0.2789221554994583, + "step": 2696 + }, + { + "epoch": 0.168625, + "grad_norm": 3.3125, + "grad_norm_var": 0.061421712239583336, + "learning_rate": 0.0001, + "loss": 8.1672, + "loss/crossentropy": 2.1731218099594116, + "loss/hidden": 3.2265625, + "loss/jsd": 0.0, + "loss/logits": 0.24963228404521942, + "step": 2698 + }, + { + "epoch": 0.16875, + "grad_norm": 3.125, + "grad_norm_var": 0.051423136393229166, + "learning_rate": 0.0001, + "loss": 8.3729, + "loss/crossentropy": 2.2801939249038696, + "loss/hidden": 3.203125, + "loss/jsd": 0.0, + "loss/logits": 0.26613737642765045, + "step": 2700 + }, + { + "epoch": 0.168875, + "grad_norm": 3.25, + "grad_norm_var": 0.045653279622395834, + "learning_rate": 0.0001, + "loss": 8.3193, + "loss/crossentropy": 2.245741128921509, + "loss/hidden": 3.171875, + "loss/jsd": 0.0, + "loss/logits": 0.27671462297439575, + "step": 2702 + }, + { + "epoch": 0.169, + "grad_norm": 3.125, + "grad_norm_var": 0.043680826822916664, + "learning_rate": 0.0001, + "loss": 8.3377, + "loss/crossentropy": 2.2457879781723022, + "loss/hidden": 3.2109375, + "loss/jsd": 0.0, + "loss/logits": 0.26047301292419434, + "step": 2704 + }, + { + "epoch": 0.169125, + "grad_norm": 3.125, + "grad_norm_var": 0.04413960774739583, + "learning_rate": 0.0001, + "loss": 8.177, + "loss/crossentropy": 2.5720086097717285, + "loss/hidden": 3.203125, + "loss/jsd": 0.0, + "loss/logits": 0.2824274003505707, + "step": 2706 + }, + { + "epoch": 0.16925, + "grad_norm": 3.078125, + "grad_norm_var": 0.04474995930989583, + "learning_rate": 0.0001, + "loss": 8.2283, + "loss/crossentropy": 2.1722124814987183, + "loss/hidden": 3.21875, + "loss/jsd": 0.0, + "loss/logits": 0.28728562593460083, + "step": 2708 + }, + { + "epoch": 0.169375, + "grad_norm": 3.140625, + "grad_norm_var": 0.04452718098958333, + "learning_rate": 0.0001, + "loss": 8.3363, + "loss/crossentropy": 2.4735668897628784, + "loss/hidden": 3.234375, + "loss/jsd": 0.0, + "loss/logits": 0.279041588306427, + "step": 2710 + }, + { + "epoch": 0.1695, + "grad_norm": 3.125, + "grad_norm_var": 0.03596903483072917, + "learning_rate": 0.0001, + "loss": 8.3597, + "loss/crossentropy": 2.4349948167800903, + "loss/hidden": 3.25, + "loss/jsd": 0.0, + "loss/logits": 0.3199606239795685, + "step": 2712 + }, + { + "epoch": 0.169625, + "grad_norm": 2.984375, + "grad_norm_var": 0.015458170572916667, + "learning_rate": 0.0001, + "loss": 8.3392, + "loss/crossentropy": 2.2743479013442993, + "loss/hidden": 3.1953125, + "loss/jsd": 0.0, + "loss/logits": 0.25963588804006577, + "step": 2714 + }, + { + "epoch": 0.16975, + "grad_norm": 2.984375, + "grad_norm_var": 0.017975870768229166, + "learning_rate": 0.0001, + "loss": 8.1114, + "loss/crossentropy": 2.3894020318984985, + "loss/hidden": 3.234375, + "loss/jsd": 0.0, + "loss/logits": 0.26319143176078796, + "step": 2716 + }, + { + "epoch": 0.169875, + "grad_norm": 2.90625, + "grad_norm_var": 0.022412109375, + "learning_rate": 0.0001, + "loss": 8.119, + "loss/crossentropy": 2.4183582067489624, + "loss/hidden": 3.2109375, + "loss/jsd": 0.0, + "loss/logits": 0.2642783522605896, + "step": 2718 + }, + { + "epoch": 0.17, + "grad_norm": 3.203125, + "grad_norm_var": 0.025560506184895835, + "learning_rate": 0.0001, + "loss": 8.1457, + "loss/crossentropy": 2.218894124031067, + "loss/hidden": 3.1796875, + "loss/jsd": 0.0, + "loss/logits": 0.2494705617427826, + "step": 2720 + }, + { + "epoch": 0.170125, + "grad_norm": 3.21875, + "grad_norm_var": 0.029157511393229165, + "learning_rate": 0.0001, + "loss": 8.4283, + "loss/crossentropy": 2.3041937351226807, + "loss/hidden": 3.203125, + "loss/jsd": 0.0, + "loss/logits": 0.27814508974552155, + "step": 2722 + }, + { + "epoch": 0.17025, + "grad_norm": 2.953125, + "grad_norm_var": 0.03127848307291667, + "learning_rate": 0.0001, + "loss": 8.2177, + "loss/crossentropy": 2.212631940841675, + "loss/hidden": 3.203125, + "loss/jsd": 0.0, + "loss/logits": 0.25353027880191803, + "step": 2724 + }, + { + "epoch": 0.170375, + "grad_norm": 3.375, + "grad_norm_var": 0.024665323893229167, + "learning_rate": 0.0001, + "loss": 8.2822, + "loss/crossentropy": 2.395217537879944, + "loss/hidden": 3.265625, + "loss/jsd": 0.0, + "loss/logits": 0.2819966673851013, + "step": 2726 + }, + { + "epoch": 0.1705, + "grad_norm": 3.09375, + "grad_norm_var": 0.027074178059895832, + "learning_rate": 0.0001, + "loss": 8.6286, + "loss/crossentropy": 2.7145198583602905, + "loss/hidden": 3.2734375, + "loss/jsd": 0.0, + "loss/logits": 0.26581692695617676, + "step": 2728 + }, + { + "epoch": 0.170625, + "grad_norm": 3.1875, + "grad_norm_var": 0.026227823893229165, + "learning_rate": 0.0001, + "loss": 8.3205, + "loss/crossentropy": 2.30352520942688, + "loss/hidden": 3.1796875, + "loss/jsd": 0.0, + "loss/logits": 0.2516012564301491, + "step": 2730 + }, + { + "epoch": 0.17075, + "grad_norm": 2.9375, + "grad_norm_var": 0.0263336181640625, + "learning_rate": 0.0001, + "loss": 8.1426, + "loss/crossentropy": 2.5575900077819824, + "loss/hidden": 3.2421875, + "loss/jsd": 0.0, + "loss/logits": 0.25674693286418915, + "step": 2732 + }, + { + "epoch": 0.170875, + "grad_norm": 3.203125, + "grad_norm_var": 0.02105712890625, + "learning_rate": 0.0001, + "loss": 8.2278, + "loss/crossentropy": 2.5535298585891724, + "loss/hidden": 3.25, + "loss/jsd": 0.0, + "loss/logits": 0.2858099341392517, + "step": 2734 + }, + { + "epoch": 0.171, + "grad_norm": 3.09375, + "grad_norm_var": 0.018220011393229166, + "learning_rate": 0.0001, + "loss": 7.9985, + "loss/crossentropy": 2.249255061149597, + "loss/hidden": 3.2109375, + "loss/jsd": 0.0, + "loss/logits": 0.24624134600162506, + "step": 2736 + }, + { + "epoch": 0.171125, + "grad_norm": 2.890625, + "grad_norm_var": 0.019391886393229165, + "learning_rate": 0.0001, + "loss": 8.2368, + "loss/crossentropy": 2.3054301738739014, + "loss/hidden": 3.171875, + "loss/jsd": 0.0, + "loss/logits": 0.255063071846962, + "step": 2738 + }, + { + "epoch": 0.17125, + "grad_norm": 3.046875, + "grad_norm_var": 0.017801920572916668, + "learning_rate": 0.0001, + "loss": 8.3647, + "loss/crossentropy": 2.4051593542099, + "loss/hidden": 3.25, + "loss/jsd": 0.0, + "loss/logits": 0.3074689656496048, + "step": 2740 + }, + { + "epoch": 0.171375, + "grad_norm": 3.359375, + "grad_norm_var": 0.017365519205729166, + "learning_rate": 0.0001, + "loss": 8.1814, + "loss/crossentropy": 2.451170325279236, + "loss/hidden": 3.1796875, + "loss/jsd": 0.0, + "loss/logits": 0.2629931569099426, + "step": 2742 + }, + { + "epoch": 0.1715, + "grad_norm": 3.078125, + "grad_norm_var": 0.014802042643229167, + "learning_rate": 0.0001, + "loss": 8.2261, + "loss/crossentropy": 2.5258573293685913, + "loss/hidden": 3.265625, + "loss/jsd": 0.0, + "loss/logits": 0.28129981458187103, + "step": 2744 + }, + { + "epoch": 0.171625, + "grad_norm": 3.046875, + "grad_norm_var": 0.014232381184895834, + "learning_rate": 0.0001, + "loss": 8.2026, + "loss/crossentropy": 2.59726619720459, + "loss/hidden": 3.1796875, + "loss/jsd": 0.0, + "loss/logits": 0.27656108140945435, + "step": 2746 + }, + { + "epoch": 0.17175, + "grad_norm": 3.09375, + "grad_norm_var": 0.012984212239583333, + "learning_rate": 0.0001, + "loss": 8.3833, + "loss/crossentropy": 2.590458631515503, + "loss/hidden": 3.25, + "loss/jsd": 0.0, + "loss/logits": 0.2706581801176071, + "step": 2748 + }, + { + "epoch": 0.171875, + "grad_norm": 2.96875, + "grad_norm_var": 0.014058430989583334, + "learning_rate": 0.0001, + "loss": 8.0601, + "loss/crossentropy": 2.3810765743255615, + "loss/hidden": 3.2421875, + "loss/jsd": 0.0, + "loss/logits": 0.24706510454416275, + "step": 2750 + }, + { + "epoch": 0.172, + "grad_norm": 2.921875, + "grad_norm_var": 0.015360514322916666, + "learning_rate": 0.0001, + "loss": 8.0355, + "loss/crossentropy": 2.258496880531311, + "loss/hidden": 3.1875, + "loss/jsd": 0.0, + "loss/logits": 0.26913100481033325, + "step": 2752 + }, + { + "epoch": 0.172125, + "grad_norm": 3.296875, + "grad_norm_var": 0.0178131103515625, + "learning_rate": 0.0001, + "loss": 8.2945, + "loss/crossentropy": 2.477080225944519, + "loss/hidden": 3.2578125, + "loss/jsd": 0.0, + "loss/logits": 0.26651376485824585, + "step": 2754 + }, + { + "epoch": 0.17225, + "grad_norm": 3.09375, + "grad_norm_var": 0.0175933837890625, + "learning_rate": 0.0001, + "loss": 8.1, + "loss/crossentropy": 2.3166109323501587, + "loss/hidden": 3.1875, + "loss/jsd": 0.0, + "loss/logits": 0.24783167243003845, + "step": 2756 + }, + { + "epoch": 0.172375, + "grad_norm": 2.875, + "grad_norm_var": 0.014012654622395834, + "learning_rate": 0.0001, + "loss": 8.1753, + "loss/crossentropy": 2.3947906494140625, + "loss/hidden": 3.1953125, + "loss/jsd": 0.0, + "loss/logits": 0.2784908413887024, + "step": 2758 + }, + { + "epoch": 0.1725, + "grad_norm": 3.171875, + "grad_norm_var": 0.0149078369140625, + "learning_rate": 0.0001, + "loss": 8.298, + "loss/crossentropy": 2.308673143386841, + "loss/hidden": 3.1875, + "loss/jsd": 0.0, + "loss/logits": 0.2578892260789871, + "step": 2760 + }, + { + "epoch": 0.172625, + "grad_norm": 3.140625, + "grad_norm_var": 0.015816243489583333, + "learning_rate": 0.0001, + "loss": 8.277, + "loss/crossentropy": 2.410847306251526, + "loss/hidden": 3.171875, + "loss/jsd": 0.0, + "loss/logits": 0.2580881118774414, + "step": 2762 + }, + { + "epoch": 0.17275, + "grad_norm": 2.90625, + "grad_norm_var": 0.0188385009765625, + "learning_rate": 0.0001, + "loss": 8.3098, + "loss/crossentropy": 2.4470525979995728, + "loss/hidden": 3.25, + "loss/jsd": 0.0, + "loss/logits": 0.25711022317409515, + "step": 2764 + }, + { + "epoch": 0.172875, + "grad_norm": 3.125, + "grad_norm_var": 0.018603515625, + "learning_rate": 0.0001, + "loss": 8.384, + "loss/crossentropy": 2.5611066818237305, + "loss/hidden": 3.328125, + "loss/jsd": 0.0, + "loss/logits": 0.2998035103082657, + "step": 2766 + }, + { + "epoch": 0.173, + "grad_norm": 2.984375, + "grad_norm_var": 0.016258748372395833, + "learning_rate": 0.0001, + "loss": 8.3617, + "loss/crossentropy": 2.5504335165023804, + "loss/hidden": 3.1875, + "loss/jsd": 0.0, + "loss/logits": 0.27010831236839294, + "step": 2768 + }, + { + "epoch": 0.173125, + "grad_norm": 2.703125, + "grad_norm_var": 0.020897420247395833, + "learning_rate": 0.0001, + "loss": 8.0711, + "loss/crossentropy": 2.2784290313720703, + "loss/hidden": 3.2578125, + "loss/jsd": 0.0, + "loss/logits": 0.25674042105674744, + "step": 2770 + }, + { + "epoch": 0.17325, + "grad_norm": 3.25, + "grad_norm_var": 0.022782389322916666, + "learning_rate": 0.0001, + "loss": 8.2164, + "loss/crossentropy": 2.421698570251465, + "loss/hidden": 3.21875, + "loss/jsd": 0.0, + "loss/logits": 0.2708132416009903, + "step": 2772 + }, + { + "epoch": 0.173375, + "grad_norm": 3.703125, + "grad_norm_var": 0.05318603515625, + "learning_rate": 0.0001, + "loss": 8.2761, + "loss/crossentropy": 2.1522774696350098, + "loss/hidden": 3.1796875, + "loss/jsd": 0.0, + "loss/logits": 0.24617627263069153, + "step": 2774 + }, + { + "epoch": 0.1735, + "grad_norm": 3.09375, + "grad_norm_var": 0.0539459228515625, + "learning_rate": 0.0001, + "loss": 8.3435, + "loss/crossentropy": 2.5082781314849854, + "loss/hidden": 3.25, + "loss/jsd": 0.0, + "loss/logits": 0.27957382798194885, + "step": 2776 + }, + { + "epoch": 0.173625, + "grad_norm": 3.21875, + "grad_norm_var": 0.05487874348958333, + "learning_rate": 0.0001, + "loss": 8.0423, + "loss/crossentropy": 2.2934749126434326, + "loss/hidden": 3.1953125, + "loss/jsd": 0.0, + "loss/logits": 0.2753216177225113, + "step": 2778 + }, + { + "epoch": 0.17375, + "grad_norm": 2.984375, + "grad_norm_var": 0.05186258951822917, + "learning_rate": 0.0001, + "loss": 8.3362, + "loss/crossentropy": 2.4466487169265747, + "loss/hidden": 3.1875, + "loss/jsd": 0.0, + "loss/logits": 0.2686302661895752, + "step": 2780 + }, + { + "epoch": 0.173875, + "grad_norm": 3.0, + "grad_norm_var": 0.05314839680989583, + "learning_rate": 0.0001, + "loss": 8.2283, + "loss/crossentropy": 2.2464054822921753, + "loss/hidden": 3.2109375, + "loss/jsd": 0.0, + "loss/logits": 0.27123279869556427, + "step": 2782 + }, + { + "epoch": 0.174, + "grad_norm": 3.046875, + "grad_norm_var": 0.0550933837890625, + "learning_rate": 0.0001, + "loss": 8.2158, + "loss/crossentropy": 2.3913623094558716, + "loss/hidden": 3.21875, + "loss/jsd": 0.0, + "loss/logits": 0.27313700318336487, + "step": 2784 + }, + { + "epoch": 0.174125, + "grad_norm": 3.0625, + "grad_norm_var": 0.0407623291015625, + "learning_rate": 0.0001, + "loss": 8.2836, + "loss/crossentropy": 2.512578248977661, + "loss/hidden": 3.1875, + "loss/jsd": 0.0, + "loss/logits": 0.2598787695169449, + "step": 2786 + }, + { + "epoch": 0.17425, + "grad_norm": 2.984375, + "grad_norm_var": 0.04078369140625, + "learning_rate": 0.0001, + "loss": 8.0561, + "loss/crossentropy": 2.5069663524627686, + "loss/hidden": 3.1796875, + "loss/jsd": 0.0, + "loss/logits": 0.273094579577446, + "step": 2788 + }, + { + "epoch": 0.174375, + "grad_norm": 3.15625, + "grad_norm_var": 0.012300618489583333, + "learning_rate": 0.0001, + "loss": 8.1484, + "loss/crossentropy": 2.0232608318328857, + "loss/hidden": 3.15625, + "loss/jsd": 0.0, + "loss/logits": 0.2657552883028984, + "step": 2790 + }, + { + "epoch": 0.1745, + "grad_norm": 2.96875, + "grad_norm_var": 0.017220052083333333, + "learning_rate": 0.0001, + "loss": 8.4669, + "loss/crossentropy": 2.387427568435669, + "loss/hidden": 3.21875, + "loss/jsd": 0.0, + "loss/logits": 0.2625594586133957, + "step": 2792 + }, + { + "epoch": 0.174625, + "grad_norm": 3.03125, + "grad_norm_var": 0.016405232747395835, + "learning_rate": 0.0001, + "loss": 8.3922, + "loss/crossentropy": 2.3673804998397827, + "loss/hidden": 3.234375, + "loss/jsd": 0.0, + "loss/logits": 0.28117281198501587, + "step": 2794 + }, + { + "epoch": 0.17475, + "grad_norm": 3.265625, + "grad_norm_var": 0.03388264973958333, + "learning_rate": 0.0001, + "loss": 8.2787, + "loss/crossentropy": 2.4962185621261597, + "loss/hidden": 3.265625, + "loss/jsd": 0.0, + "loss/logits": 0.286916047334671, + "step": 2796 + }, + { + "epoch": 0.174875, + "grad_norm": 3.40625, + "grad_norm_var": 0.03957926432291667, + "learning_rate": 0.0001, + "loss": 8.2466, + "loss/crossentropy": 2.418305993080139, + "loss/hidden": 3.25, + "loss/jsd": 0.0, + "loss/logits": 0.2539825364947319, + "step": 2798 + }, + { + "epoch": 0.175, + "grad_norm": 2.984375, + "grad_norm_var": 0.038304646809895836, + "learning_rate": 0.0001, + "loss": 8.1027, + "loss/crossentropy": 2.1551238298416138, + "loss/hidden": 3.203125, + "loss/jsd": 0.0, + "loss/logits": 0.2831410765647888, + "step": 2800 + }, + { + "epoch": 0.175125, + "grad_norm": 3.125, + "grad_norm_var": 0.03675130208333333, + "learning_rate": 0.0001, + "loss": 8.3028, + "loss/crossentropy": 2.063372015953064, + "loss/hidden": 3.1953125, + "loss/jsd": 0.0, + "loss/logits": 0.2392185851931572, + "step": 2802 + }, + { + "epoch": 0.17525, + "grad_norm": 3.25, + "grad_norm_var": 0.03528645833333333, + "learning_rate": 0.0001, + "loss": 8.3566, + "loss/crossentropy": 2.4662251472473145, + "loss/hidden": 3.2265625, + "loss/jsd": 0.0, + "loss/logits": 0.2643844783306122, + "step": 2804 + }, + { + "epoch": 0.175375, + "grad_norm": 3.03125, + "grad_norm_var": 0.03893229166666667, + "learning_rate": 0.0001, + "loss": 8.2343, + "loss/crossentropy": 2.275088667869568, + "loss/hidden": 3.171875, + "loss/jsd": 0.0, + "loss/logits": 0.25276893377304077, + "step": 2806 + }, + { + "epoch": 0.1755, + "grad_norm": 3.15625, + "grad_norm_var": 0.030692545572916667, + "learning_rate": 0.0001, + "loss": 8.2212, + "loss/crossentropy": 2.20920592546463, + "loss/hidden": 3.1796875, + "loss/jsd": 0.0, + "loss/logits": 0.25742238759994507, + "step": 2808 + }, + { + "epoch": 0.175625, + "grad_norm": 3.125, + "grad_norm_var": 0.028450520833333333, + "learning_rate": 0.0001, + "loss": 8.4952, + "loss/crossentropy": 2.378191828727722, + "loss/hidden": 3.3046875, + "loss/jsd": 0.0, + "loss/logits": 0.3033263385295868, + "step": 2810 + }, + { + "epoch": 0.17575, + "grad_norm": 3.09375, + "grad_norm_var": 0.0159332275390625, + "learning_rate": 0.0001, + "loss": 8.3287, + "loss/crossentropy": 2.3606228828430176, + "loss/hidden": 3.2421875, + "loss/jsd": 0.0, + "loss/logits": 0.26286470890045166, + "step": 2812 + }, + { + "epoch": 0.175875, + "grad_norm": 3.171875, + "grad_norm_var": 0.0115631103515625, + "learning_rate": 0.0001, + "loss": 8.0903, + "loss/crossentropy": 2.5904735326766968, + "loss/hidden": 3.25, + "loss/jsd": 0.0, + "loss/logits": 0.28542497754096985, + "step": 2814 + }, + { + "epoch": 0.176, + "grad_norm": 3.015625, + "grad_norm_var": 0.010970052083333333, + "learning_rate": 0.0001, + "loss": 8.2841, + "loss/crossentropy": 2.1312711238861084, + "loss/hidden": 3.15625, + "loss/jsd": 0.0, + "loss/logits": 0.2546592205762863, + "step": 2816 + }, + { + "epoch": 0.176125, + "grad_norm": 3.203125, + "grad_norm_var": 0.011747233072916667, + "learning_rate": 0.0001, + "loss": 8.3833, + "loss/crossentropy": 2.4488954544067383, + "loss/hidden": 3.28125, + "loss/jsd": 0.0, + "loss/logits": 0.27959881722927094, + "step": 2818 + }, + { + "epoch": 0.17625, + "grad_norm": 3.125, + "grad_norm_var": 0.010041300455729167, + "learning_rate": 0.0001, + "loss": 8.364, + "loss/crossentropy": 2.4785863161087036, + "loss/hidden": 3.21875, + "loss/jsd": 0.0, + "loss/logits": 0.26612453162670135, + "step": 2820 + }, + { + "epoch": 0.176375, + "grad_norm": 3.171875, + "grad_norm_var": 0.008503214518229166, + "learning_rate": 0.0001, + "loss": 8.1876, + "loss/crossentropy": 2.344822645187378, + "loss/hidden": 3.1875, + "loss/jsd": 0.0, + "loss/logits": 0.24801667034626007, + "step": 2822 + }, + { + "epoch": 0.1765, + "grad_norm": 2.984375, + "grad_norm_var": 0.009000651041666667, + "learning_rate": 0.0001, + "loss": 8.2002, + "loss/crossentropy": 2.2322527170181274, + "loss/hidden": 3.171875, + "loss/jsd": 0.0, + "loss/logits": 0.23924288898706436, + "step": 2824 + }, + { + "epoch": 0.176625, + "grad_norm": 2.84375, + "grad_norm_var": 0.011747233072916667, + "learning_rate": 0.0001, + "loss": 8.2038, + "loss/crossentropy": 2.2701025009155273, + "loss/hidden": 3.1484375, + "loss/jsd": 0.0, + "loss/logits": 0.24792873859405518, + "step": 2826 + }, + { + "epoch": 0.17675, + "grad_norm": 2.8125, + "grad_norm_var": 0.017313639322916668, + "learning_rate": 0.0001, + "loss": 8.0395, + "loss/crossentropy": 2.2964216470718384, + "loss/hidden": 3.1640625, + "loss/jsd": 0.0, + "loss/logits": 0.24705877900123596, + "step": 2828 + }, + { + "epoch": 0.176875, + "grad_norm": 2.953125, + "grad_norm_var": 0.017378743489583334, + "learning_rate": 0.0001, + "loss": 8.4679, + "loss/crossentropy": 2.418280839920044, + "loss/hidden": 3.203125, + "loss/jsd": 0.0, + "loss/logits": 0.3100636601448059, + "step": 2830 + }, + { + "epoch": 0.177, + "grad_norm": 3.375, + "grad_norm_var": 0.026488240559895834, + "learning_rate": 0.0001, + "loss": 8.1278, + "loss/crossentropy": 2.4353508949279785, + "loss/hidden": 3.328125, + "loss/jsd": 0.0, + "loss/logits": 0.2712879627943039, + "step": 2832 + }, + { + "epoch": 0.177125, + "grad_norm": 2.96875, + "grad_norm_var": 0.023363240559895835, + "learning_rate": 0.0001, + "loss": 8.4328, + "loss/crossentropy": 2.277379631996155, + "loss/hidden": 3.1875, + "loss/jsd": 0.0, + "loss/logits": 0.2677570581436157, + "step": 2834 + }, + { + "epoch": 0.17725, + "grad_norm": 3.078125, + "grad_norm_var": 0.024348958333333334, + "learning_rate": 0.0001, + "loss": 7.8742, + "loss/crossentropy": 2.3050626516342163, + "loss/hidden": 3.203125, + "loss/jsd": 0.0, + "loss/logits": 0.25615155696868896, + "step": 2836 + }, + { + "epoch": 0.177375, + "grad_norm": 2.671875, + "grad_norm_var": 0.031102498372395832, + "learning_rate": 0.0001, + "loss": 8.1044, + "loss/crossentropy": 2.2054827213287354, + "loss/hidden": 3.171875, + "loss/jsd": 0.0, + "loss/logits": 0.26621413230895996, + "step": 2838 + }, + { + "epoch": 0.1775, + "grad_norm": 3.40625, + "grad_norm_var": 0.04169820149739583, + "learning_rate": 0.0001, + "loss": 8.0213, + "loss/crossentropy": 2.344091534614563, + "loss/hidden": 3.21875, + "loss/jsd": 0.0, + "loss/logits": 0.26311126351356506, + "step": 2840 + }, + { + "epoch": 0.177625, + "grad_norm": 2.9375, + "grad_norm_var": 0.039525349934895836, + "learning_rate": 0.0001, + "loss": 8.2404, + "loss/crossentropy": 2.3576395511627197, + "loss/hidden": 3.28125, + "loss/jsd": 0.0, + "loss/logits": 0.26957815885543823, + "step": 2842 + }, + { + "epoch": 0.17775, + "grad_norm": 3.03125, + "grad_norm_var": 0.03683268229166667, + "learning_rate": 0.0001, + "loss": 8.1148, + "loss/crossentropy": 2.0740586519241333, + "loss/hidden": 3.1484375, + "loss/jsd": 0.0, + "loss/logits": 0.2444992959499359, + "step": 2844 + }, + { + "epoch": 0.177875, + "grad_norm": 2.875, + "grad_norm_var": 0.03797098795572917, + "learning_rate": 0.0001, + "loss": 8.0264, + "loss/crossentropy": 2.196990489959717, + "loss/hidden": 3.1953125, + "loss/jsd": 0.0, + "loss/logits": 0.2584913969039917, + "step": 2846 + }, + { + "epoch": 0.178, + "grad_norm": 3.078125, + "grad_norm_var": 0.029931640625, + "learning_rate": 0.0001, + "loss": 8.0912, + "loss/crossentropy": 2.465600848197937, + "loss/hidden": 3.265625, + "loss/jsd": 0.0, + "loss/logits": 0.31103046238422394, + "step": 2848 + }, + { + "epoch": 0.178125, + "grad_norm": 3.390625, + "grad_norm_var": 0.03961588541666667, + "learning_rate": 0.0001, + "loss": 8.6973, + "loss/crossentropy": 2.542473077774048, + "loss/hidden": 3.25, + "loss/jsd": 0.0, + "loss/logits": 0.2756101042032242, + "step": 2850 + }, + { + "epoch": 0.17825, + "grad_norm": 3.046875, + "grad_norm_var": 0.038248697916666664, + "learning_rate": 0.0001, + "loss": 8.0748, + "loss/crossentropy": 2.357996344566345, + "loss/hidden": 3.1484375, + "loss/jsd": 0.0, + "loss/logits": 0.2627618610858917, + "step": 2852 + }, + { + "epoch": 0.178375, + "grad_norm": 3.125, + "grad_norm_var": 0.0279449462890625, + "learning_rate": 0.0001, + "loss": 8.2433, + "loss/crossentropy": 2.387884736061096, + "loss/hidden": 3.1484375, + "loss/jsd": 0.0, + "loss/logits": 0.2805600166320801, + "step": 2854 + }, + { + "epoch": 0.1785, + "grad_norm": 3.21875, + "grad_norm_var": 0.02301025390625, + "learning_rate": 0.0001, + "loss": 8.2198, + "loss/crossentropy": 2.2316168546676636, + "loss/hidden": 3.25, + "loss/jsd": 0.0, + "loss/logits": 0.2630866765975952, + "step": 2856 + }, + { + "epoch": 0.178625, + "grad_norm": 3.171875, + "grad_norm_var": 0.024898274739583334, + "learning_rate": 0.0001, + "loss": 8.1996, + "loss/crossentropy": 2.388888955116272, + "loss/hidden": 3.1796875, + "loss/jsd": 0.0, + "loss/logits": 0.2638123258948326, + "step": 2858 + }, + { + "epoch": 0.17875, + "grad_norm": 2.890625, + "grad_norm_var": 0.025519816080729167, + "learning_rate": 0.0001, + "loss": 8.2193, + "loss/crossentropy": 2.575498104095459, + "loss/hidden": 3.1953125, + "loss/jsd": 0.0, + "loss/logits": 0.2572275549173355, + "step": 2860 + }, + { + "epoch": 0.178875, + "grad_norm": 3.171875, + "grad_norm_var": 0.024837239583333334, + "learning_rate": 0.0001, + "loss": 8.3578, + "loss/crossentropy": 2.524762511253357, + "loss/hidden": 3.2578125, + "loss/jsd": 0.0, + "loss/logits": 0.3061821162700653, + "step": 2862 + }, + { + "epoch": 0.179, + "grad_norm": 3.125, + "grad_norm_var": 0.025716145833333332, + "learning_rate": 0.0001, + "loss": 8.2261, + "loss/crossentropy": 2.2874268293380737, + "loss/hidden": 3.1796875, + "loss/jsd": 0.0, + "loss/logits": 0.26778069138526917, + "step": 2864 + }, + { + "epoch": 0.179125, + "grad_norm": 3.203125, + "grad_norm_var": 0.022591145833333333, + "learning_rate": 0.0001, + "loss": 8.3879, + "loss/crossentropy": 2.28354811668396, + "loss/hidden": 3.1796875, + "loss/jsd": 0.0, + "loss/logits": 0.26096734404563904, + "step": 2866 + }, + { + "epoch": 0.17925, + "grad_norm": 3.15625, + "grad_norm_var": 0.025211588541666666, + "learning_rate": 0.0001, + "loss": 8.2155, + "loss/crossentropy": 2.1607202291488647, + "loss/hidden": 3.2109375, + "loss/jsd": 0.0, + "loss/logits": 0.26596730947494507, + "step": 2868 + }, + { + "epoch": 0.179375, + "grad_norm": 3.53125, + "grad_norm_var": 0.0307769775390625, + "learning_rate": 0.0001, + "loss": 8.2848, + "loss/crossentropy": 2.4421184062957764, + "loss/hidden": 3.171875, + "loss/jsd": 0.0, + "loss/logits": 0.25268222391605377, + "step": 2870 + }, + { + "epoch": 0.1795, + "grad_norm": 3.078125, + "grad_norm_var": 0.03218994140625, + "learning_rate": 0.0001, + "loss": 8.184, + "loss/crossentropy": 2.24539315700531, + "loss/hidden": 3.140625, + "loss/jsd": 0.0, + "loss/logits": 0.2509969249367714, + "step": 2872 + }, + { + "epoch": 0.179625, + "grad_norm": 2.9375, + "grad_norm_var": 0.031126912434895834, + "learning_rate": 0.0001, + "loss": 8.0381, + "loss/crossentropy": 2.1964842081069946, + "loss/hidden": 3.2890625, + "loss/jsd": 0.0, + "loss/logits": 0.261972077190876, + "step": 2874 + }, + { + "epoch": 0.17975, + "grad_norm": 3.140625, + "grad_norm_var": 0.026756795247395833, + "learning_rate": 0.0001, + "loss": 8.2261, + "loss/crossentropy": 2.2588162422180176, + "loss/hidden": 3.2421875, + "loss/jsd": 0.0, + "loss/logits": 0.28808237612247467, + "step": 2876 + }, + { + "epoch": 0.179875, + "grad_norm": 2.890625, + "grad_norm_var": 0.03379618326822917, + "learning_rate": 0.0001, + "loss": 8.163, + "loss/crossentropy": 2.177803933620453, + "loss/hidden": 3.2734375, + "loss/jsd": 0.0, + "loss/logits": 0.27027180790901184, + "step": 2878 + }, + { + "epoch": 0.18, + "grad_norm": 3.109375, + "grad_norm_var": 0.030500284830729165, + "learning_rate": 0.0001, + "loss": 8.236, + "loss/crossentropy": 2.0715816020965576, + "loss/hidden": 3.171875, + "loss/jsd": 0.0, + "loss/logits": 0.25250908732414246, + "step": 2880 + }, + { + "epoch": 0.180125, + "grad_norm": 2.953125, + "grad_norm_var": 0.031086222330729166, + "learning_rate": 0.0001, + "loss": 8.1597, + "loss/crossentropy": 2.3349109888076782, + "loss/hidden": 3.1875, + "loss/jsd": 0.0, + "loss/logits": 0.2642297148704529, + "step": 2882 + }, + { + "epoch": 0.18025, + "grad_norm": 3.21875, + "grad_norm_var": 0.02681884765625, + "learning_rate": 0.0001, + "loss": 8.0044, + "loss/crossentropy": 2.1576240062713623, + "loss/hidden": 3.2109375, + "loss/jsd": 0.0, + "loss/logits": 0.25084151327610016, + "step": 2884 + }, + { + "epoch": 0.180375, + "grad_norm": 2.921875, + "grad_norm_var": 0.01011962890625, + "learning_rate": 0.0001, + "loss": 8.0148, + "loss/crossentropy": 2.1341161131858826, + "loss/hidden": 3.1953125, + "loss/jsd": 0.0, + "loss/logits": 0.23193783313035965, + "step": 2886 + }, + { + "epoch": 0.1805, + "grad_norm": 3.0, + "grad_norm_var": 0.010856119791666667, + "learning_rate": 0.0001, + "loss": 8.1343, + "loss/crossentropy": 2.2248082160949707, + "loss/hidden": 3.1328125, + "loss/jsd": 0.0, + "loss/logits": 0.26918257772922516, + "step": 2888 + }, + { + "epoch": 0.180625, + "grad_norm": 3.03125, + "grad_norm_var": 0.010334269205729166, + "learning_rate": 0.0001, + "loss": 8.187, + "loss/crossentropy": 2.5334991216659546, + "loss/hidden": 3.2109375, + "loss/jsd": 0.0, + "loss/logits": 0.29171979427337646, + "step": 2890 + }, + { + "epoch": 0.18075, + "grad_norm": 3.109375, + "grad_norm_var": 0.010367838541666667, + "learning_rate": 0.0001, + "loss": 8.2264, + "loss/crossentropy": 2.3791427612304688, + "loss/hidden": 3.2109375, + "loss/jsd": 0.0, + "loss/logits": 0.27077220380306244, + "step": 2892 + }, + { + "epoch": 0.180875, + "grad_norm": 3.21875, + "grad_norm_var": 0.0108306884765625, + "learning_rate": 0.0001, + "loss": 8.0115, + "loss/crossentropy": 2.189573884010315, + "loss/hidden": 3.140625, + "loss/jsd": 0.0, + "loss/logits": 0.268039345741272, + "step": 2894 + }, + { + "epoch": 0.181, + "grad_norm": 3.015625, + "grad_norm_var": 0.01060791015625, + "learning_rate": 0.0001, + "loss": 8.3331, + "loss/crossentropy": 2.399560272693634, + "loss/hidden": 3.1953125, + "loss/jsd": 0.0, + "loss/logits": 0.24194006621837616, + "step": 2896 + }, + { + "epoch": 0.181125, + "grad_norm": 3.03125, + "grad_norm_var": 0.010184733072916667, + "learning_rate": 0.0001, + "loss": 8.1515, + "loss/crossentropy": 2.140601396560669, + "loss/hidden": 3.2265625, + "loss/jsd": 0.0, + "loss/logits": 0.24887196719646454, + "step": 2898 + }, + { + "epoch": 0.18125, + "grad_norm": 2.75, + "grad_norm_var": 0.014872233072916666, + "learning_rate": 0.0001, + "loss": 7.7282, + "loss/crossentropy": 2.0853304862976074, + "loss/hidden": 3.1640625, + "loss/jsd": 0.0, + "loss/logits": 0.2543262392282486, + "step": 2900 + }, + { + "epoch": 0.181375, + "grad_norm": 3.75, + "grad_norm_var": 1.08092041015625, + "learning_rate": 0.0001, + "loss": 8.8119, + "loss/crossentropy": 2.390069603919983, + "loss/hidden": 3.2890625, + "loss/jsd": 0.0, + "loss/logits": 0.3597041964530945, + "step": 2902 + }, + { + "epoch": 0.1815, + "grad_norm": 3.421875, + "grad_norm_var": 1.06201171875, + "learning_rate": 0.0001, + "loss": 8.1467, + "loss/crossentropy": 2.2973451614379883, + "loss/hidden": 3.15625, + "loss/jsd": 0.0, + "loss/logits": 0.30557607114315033, + "step": 2904 + }, + { + "epoch": 0.181625, + "grad_norm": 3.171875, + "grad_norm_var": 1.0474680582682292, + "learning_rate": 0.0001, + "loss": 8.1066, + "loss/crossentropy": 2.217726707458496, + "loss/hidden": 3.2734375, + "loss/jsd": 0.0, + "loss/logits": 0.2568957358598709, + "step": 2906 + }, + { + "epoch": 0.18175, + "grad_norm": 3.484375, + "grad_norm_var": 1.03580322265625, + "learning_rate": 0.0001, + "loss": 8.2859, + "loss/crossentropy": 2.403484344482422, + "loss/hidden": 3.1875, + "loss/jsd": 0.0, + "loss/logits": 0.2820377051830292, + "step": 2908 + }, + { + "epoch": 0.181875, + "grad_norm": 2.96875, + "grad_norm_var": 1.0566721598307292, + "learning_rate": 0.0001, + "loss": 8.1078, + "loss/crossentropy": 2.49627947807312, + "loss/hidden": 3.21875, + "loss/jsd": 0.0, + "loss/logits": 0.2758508771657944, + "step": 2910 + }, + { + "epoch": 0.182, + "grad_norm": 2.921875, + "grad_norm_var": 1.066657511393229, + "learning_rate": 0.0001, + "loss": 8.08, + "loss/crossentropy": 2.4209847450256348, + "loss/hidden": 3.25, + "loss/jsd": 0.0, + "loss/logits": 0.276279091835022, + "step": 2912 + }, + { + "epoch": 0.182125, + "grad_norm": 3.03125, + "grad_norm_var": 1.0872233072916666, + "learning_rate": 0.0001, + "loss": 8.0057, + "loss/crossentropy": 2.4141104221343994, + "loss/hidden": 3.1328125, + "loss/jsd": 0.0, + "loss/logits": 0.25281232595443726, + "step": 2914 + }, + { + "epoch": 0.18225, + "grad_norm": 3.71875, + "grad_norm_var": 1.044189453125, + "learning_rate": 0.0001, + "loss": 8.4177, + "loss/crossentropy": 2.1968607902526855, + "loss/hidden": 3.2265625, + "loss/jsd": 0.0, + "loss/logits": 0.2601237893104553, + "step": 2916 + }, + { + "epoch": 0.182375, + "grad_norm": 4.09375, + "grad_norm_var": 0.12899983723958333, + "learning_rate": 0.0001, + "loss": 8.6519, + "loss/crossentropy": 2.355108380317688, + "loss/hidden": 3.203125, + "loss/jsd": 0.0, + "loss/logits": 0.2877423167228699, + "step": 2918 + }, + { + "epoch": 0.1825, + "grad_norm": 2.984375, + "grad_norm_var": 0.13144429524739584, + "learning_rate": 0.0001, + "loss": 7.8936, + "loss/crossentropy": 2.0170373916625977, + "loss/hidden": 3.1640625, + "loss/jsd": 0.0, + "loss/logits": 0.23654960840940475, + "step": 2920 + }, + { + "epoch": 0.182625, + "grad_norm": 3.1875, + "grad_norm_var": 0.13209228515625, + "learning_rate": 0.0001, + "loss": 8.0005, + "loss/crossentropy": 2.3714324235916138, + "loss/hidden": 3.125, + "loss/jsd": 0.0, + "loss/logits": 0.24751296639442444, + "step": 2922 + }, + { + "epoch": 0.18275, + "grad_norm": 2.9375, + "grad_norm_var": 0.12609049479166667, + "learning_rate": 0.0001, + "loss": 8.1466, + "loss/crossentropy": 2.2555553913116455, + "loss/hidden": 3.1484375, + "loss/jsd": 0.0, + "loss/logits": 0.24768846482038498, + "step": 2924 + }, + { + "epoch": 0.182875, + "grad_norm": 3.09375, + "grad_norm_var": 0.12849019368489584, + "learning_rate": 0.0001, + "loss": 8.0836, + "loss/crossentropy": 1.9109330773353577, + "loss/hidden": 3.2421875, + "loss/jsd": 0.0, + "loss/logits": 0.25811461359262466, + "step": 2926 + }, + { + "epoch": 0.183, + "grad_norm": 3.09375, + "grad_norm_var": 0.13059488932291666, + "learning_rate": 0.0001, + "loss": 8.2019, + "loss/crossentropy": 2.4296613931655884, + "loss/hidden": 3.15625, + "loss/jsd": 0.0, + "loss/logits": 0.2532989978790283, + "step": 2928 + }, + { + "epoch": 0.183125, + "grad_norm": 3.109375, + "grad_norm_var": 0.12439778645833334, + "learning_rate": 0.0001, + "loss": 8.5959, + "loss/crossentropy": 2.5520033836364746, + "loss/hidden": 3.203125, + "loss/jsd": 0.0, + "loss/logits": 0.2584295719861984, + "step": 2930 + }, + { + "epoch": 0.18325, + "grad_norm": 3.234375, + "grad_norm_var": 0.10261128743489584, + "learning_rate": 0.0001, + "loss": 8.167, + "loss/crossentropy": 2.2617307901382446, + "loss/hidden": 3.234375, + "loss/jsd": 0.0, + "loss/logits": 0.25226905941963196, + "step": 2932 + }, + { + "epoch": 0.183375, + "grad_norm": 3.640625, + "grad_norm_var": 0.05717671712239583, + "learning_rate": 0.0001, + "loss": 8.476, + "loss/crossentropy": 2.2038698196411133, + "loss/hidden": 3.2421875, + "loss/jsd": 0.0, + "loss/logits": 0.27726230025291443, + "step": 2934 + }, + { + "epoch": 0.1835, + "grad_norm": 3.40625, + "grad_norm_var": 0.05496419270833333, + "learning_rate": 0.0001, + "loss": 8.2697, + "loss/crossentropy": 2.011132597923279, + "loss/hidden": 3.15625, + "loss/jsd": 0.0, + "loss/logits": 0.24731668829917908, + "step": 2936 + }, + { + "epoch": 0.183625, + "grad_norm": 3.015625, + "grad_norm_var": 0.05384114583333333, + "learning_rate": 0.0001, + "loss": 8.2291, + "loss/crossentropy": 2.241647481918335, + "loss/hidden": 3.15625, + "loss/jsd": 0.0, + "loss/logits": 0.24874335527420044, + "step": 2938 + }, + { + "epoch": 0.18375, + "grad_norm": 3.640625, + "grad_norm_var": 0.0597076416015625, + "learning_rate": 0.0001, + "loss": 8.1807, + "loss/crossentropy": 2.0755521059036255, + "loss/hidden": 3.1796875, + "loss/jsd": 0.0, + "loss/logits": 0.26138997077941895, + "step": 2940 + }, + { + "epoch": 0.183875, + "grad_norm": 3.015625, + "grad_norm_var": 0.05976155598958333, + "learning_rate": 0.0001, + "loss": 8.4186, + "loss/crossentropy": 2.4679603576660156, + "loss/hidden": 3.2578125, + "loss/jsd": 0.0, + "loss/logits": 0.27902379631996155, + "step": 2942 + }, + { + "epoch": 0.184, + "grad_norm": 3.0, + "grad_norm_var": 0.05484619140625, + "learning_rate": 0.0001, + "loss": 8.1271, + "loss/crossentropy": 2.0534247159957886, + "loss/hidden": 3.1796875, + "loss/jsd": 0.0, + "loss/logits": 0.26021403074264526, + "step": 2944 + }, + { + "epoch": 0.184125, + "grad_norm": 3.015625, + "grad_norm_var": 0.05119527180989583, + "learning_rate": 0.0001, + "loss": 8.2078, + "loss/crossentropy": 2.2699111700057983, + "loss/hidden": 3.1640625, + "loss/jsd": 0.0, + "loss/logits": 0.26569822430610657, + "step": 2946 + }, + { + "epoch": 0.18425, + "grad_norm": 2.828125, + "grad_norm_var": 0.06213785807291667, + "learning_rate": 0.0001, + "loss": 8.0974, + "loss/crossentropy": 2.420296549797058, + "loss/hidden": 3.1640625, + "loss/jsd": 0.0, + "loss/logits": 0.2546275109052658, + "step": 2948 + }, + { + "epoch": 0.184375, + "grad_norm": 3.34375, + "grad_norm_var": 0.04946187337239583, + "learning_rate": 0.0001, + "loss": 8.4089, + "loss/crossentropy": 2.5957722663879395, + "loss/hidden": 3.1796875, + "loss/jsd": 0.0, + "loss/logits": 0.27632173895835876, + "step": 2950 + }, + { + "epoch": 0.1845, + "grad_norm": 3.53125, + "grad_norm_var": 0.05413004557291667, + "learning_rate": 0.0001, + "loss": 8.3382, + "loss/crossentropy": 2.484778642654419, + "loss/hidden": 3.234375, + "loss/jsd": 0.0, + "loss/logits": 0.27322784066200256, + "step": 2952 + }, + { + "epoch": 0.184625, + "grad_norm": 3.078125, + "grad_norm_var": 0.056473795572916666, + "learning_rate": 0.0001, + "loss": 8.0627, + "loss/crossentropy": 2.148501753807068, + "loss/hidden": 3.1875, + "loss/jsd": 0.0, + "loss/logits": 0.2559093087911606, + "step": 2954 + }, + { + "epoch": 0.18475, + "grad_norm": 3.046875, + "grad_norm_var": 0.040771484375, + "learning_rate": 0.0001, + "loss": 8.3083, + "loss/crossentropy": 2.314954161643982, + "loss/hidden": 3.171875, + "loss/jsd": 0.0, + "loss/logits": 0.28318941593170166, + "step": 2956 + }, + { + "epoch": 0.184875, + "grad_norm": 3.015625, + "grad_norm_var": 0.03795166015625, + "learning_rate": 0.0001, + "loss": 8.1172, + "loss/crossentropy": 2.366102695465088, + "loss/hidden": 3.265625, + "loss/jsd": 0.0, + "loss/logits": 0.2760534882545471, + "step": 2958 + }, + { + "epoch": 0.185, + "grad_norm": 3.171875, + "grad_norm_var": 0.03756510416666667, + "learning_rate": 0.0001, + "loss": 8.2888, + "loss/crossentropy": 2.487810730934143, + "loss/hidden": 3.2578125, + "loss/jsd": 0.0, + "loss/logits": 0.2672920525074005, + "step": 2960 + }, + { + "epoch": 0.185125, + "grad_norm": 3.046875, + "grad_norm_var": 0.036554972330729164, + "learning_rate": 0.0001, + "loss": 8.1165, + "loss/crossentropy": 2.235316514968872, + "loss/hidden": 3.1328125, + "loss/jsd": 0.0, + "loss/logits": 0.25732211768627167, + "step": 2962 + }, + { + "epoch": 0.18525, + "grad_norm": 2.890625, + "grad_norm_var": 0.033600870768229166, + "learning_rate": 0.0001, + "loss": 8.2153, + "loss/crossentropy": 2.688939690589905, + "loss/hidden": 3.3125, + "loss/jsd": 0.0, + "loss/logits": 0.31740468740463257, + "step": 2964 + }, + { + "epoch": 0.185375, + "grad_norm": 2.921875, + "grad_norm_var": 0.030248006184895832, + "learning_rate": 0.0001, + "loss": 8.0094, + "loss/crossentropy": 2.5690836906433105, + "loss/hidden": 3.203125, + "loss/jsd": 0.0, + "loss/logits": 0.2810027152299881, + "step": 2966 + }, + { + "epoch": 0.1855, + "grad_norm": 2.953125, + "grad_norm_var": 0.013817342122395833, + "learning_rate": 0.0001, + "loss": 7.8082, + "loss/crossentropy": 2.1851229667663574, + "loss/hidden": 3.1328125, + "loss/jsd": 0.0, + "loss/logits": 0.2638121098279953, + "step": 2968 + }, + { + "epoch": 0.185625, + "grad_norm": 3.71875, + "grad_norm_var": 0.044266764322916666, + "learning_rate": 0.0001, + "loss": 8.3646, + "loss/crossentropy": 2.1047242879867554, + "loss/hidden": 3.421875, + "loss/jsd": 0.0, + "loss/logits": 0.3048202395439148, + "step": 2970 + }, + { + "epoch": 0.18575, + "grad_norm": 2.984375, + "grad_norm_var": 0.0408843994140625, + "learning_rate": 0.0001, + "loss": 8.2758, + "loss/crossentropy": 2.1301698684692383, + "loss/hidden": 3.1796875, + "loss/jsd": 0.0, + "loss/logits": 0.24628406763076782, + "step": 2972 + }, + { + "epoch": 0.185875, + "grad_norm": 2.890625, + "grad_norm_var": 0.038361612955729166, + "learning_rate": 0.0001, + "loss": 8.2939, + "loss/crossentropy": 2.433535099029541, + "loss/hidden": 3.234375, + "loss/jsd": 0.0, + "loss/logits": 0.29371021687984467, + "step": 2974 + }, + { + "epoch": 0.186, + "grad_norm": 3.0, + "grad_norm_var": 0.0372955322265625, + "learning_rate": 0.0001, + "loss": 8.1541, + "loss/crossentropy": 2.1997573375701904, + "loss/hidden": 3.1953125, + "loss/jsd": 0.0, + "loss/logits": 0.2511989399790764, + "step": 2976 + }, + { + "epoch": 0.186125, + "grad_norm": 2.9375, + "grad_norm_var": 0.03798726399739583, + "learning_rate": 0.0001, + "loss": 8.4344, + "loss/crossentropy": 2.4531192779541016, + "loss/hidden": 3.234375, + "loss/jsd": 0.0, + "loss/logits": 0.2771553546190262, + "step": 2978 + }, + { + "epoch": 0.18625, + "grad_norm": 3.15625, + "grad_norm_var": 0.03854166666666667, + "learning_rate": 0.0001, + "loss": 8.0181, + "loss/crossentropy": 2.2669776678085327, + "loss/hidden": 3.1484375, + "loss/jsd": 0.0, + "loss/logits": 0.25914834439754486, + "step": 2980 + }, + { + "epoch": 0.186375, + "grad_norm": 2.890625, + "grad_norm_var": 0.048563639322916664, + "learning_rate": 0.0001, + "loss": 8.2711, + "loss/crossentropy": 2.322340726852417, + "loss/hidden": 3.328125, + "loss/jsd": 0.0, + "loss/logits": 0.28330640494823456, + "step": 2982 + }, + { + "epoch": 0.1865, + "grad_norm": 2.90625, + "grad_norm_var": 0.049540201822916664, + "learning_rate": 0.0001, + "loss": 8.1752, + "loss/crossentropy": 2.351397395133972, + "loss/hidden": 3.15625, + "loss/jsd": 0.0, + "loss/logits": 0.24569445848464966, + "step": 2984 + }, + { + "epoch": 0.186625, + "grad_norm": 2.875, + "grad_norm_var": 0.02564697265625, + "learning_rate": 0.0001, + "loss": 8.1168, + "loss/crossentropy": 2.547055959701538, + "loss/hidden": 3.2265625, + "loss/jsd": 0.0, + "loss/logits": 0.27938394248485565, + "step": 2986 + }, + { + "epoch": 0.18675, + "grad_norm": 2.9375, + "grad_norm_var": 0.026070149739583333, + "learning_rate": 0.0001, + "loss": 8.5189, + "loss/crossentropy": 2.5092413425445557, + "loss/hidden": 3.140625, + "loss/jsd": 0.0, + "loss/logits": 0.2599862962961197, + "step": 2988 + }, + { + "epoch": 0.186875, + "grad_norm": 3.03125, + "grad_norm_var": 0.029427083333333333, + "learning_rate": 0.0001, + "loss": 8.078, + "loss/crossentropy": 2.412800908088684, + "loss/hidden": 3.1875, + "loss/jsd": 0.0, + "loss/logits": 0.2972192317247391, + "step": 2990 + }, + { + "epoch": 0.187, + "grad_norm": 3.28125, + "grad_norm_var": 0.03290608723958333, + "learning_rate": 0.0001, + "loss": 8.4171, + "loss/crossentropy": 2.5122843980789185, + "loss/hidden": 3.1953125, + "loss/jsd": 0.0, + "loss/logits": 0.2523321807384491, + "step": 2992 + }, + { + "epoch": 0.187125, + "grad_norm": 3.140625, + "grad_norm_var": 0.04843343098958333, + "learning_rate": 0.0001, + "loss": 8.2715, + "loss/crossentropy": 2.3395785093307495, + "loss/hidden": 3.171875, + "loss/jsd": 0.0, + "loss/logits": 0.2835071235895157, + "step": 2994 + }, + { + "epoch": 0.18725, + "grad_norm": 3.71875, + "grad_norm_var": 0.06779683430989583, + "learning_rate": 0.0001, + "loss": 8.4595, + "loss/crossentropy": 2.5921707153320312, + "loss/hidden": 3.3203125, + "loss/jsd": 0.0, + "loss/logits": 0.32498699426651, + "step": 2996 + }, + { + "epoch": 0.187375, + "grad_norm": 3.453125, + "grad_norm_var": 0.06386617024739584, + "learning_rate": 0.0001, + "loss": 8.0491, + "loss/crossentropy": 2.3352322578430176, + "loss/hidden": 3.234375, + "loss/jsd": 0.0, + "loss/logits": 0.2577902227640152, + "step": 2998 + }, + { + "epoch": 0.1875, + "grad_norm": 3.28125, + "grad_norm_var": 0.05859375, + "learning_rate": 0.0001, + "loss": 8.2248, + "loss/crossentropy": 2.3004229068756104, + "loss/hidden": 3.1953125, + "loss/jsd": 0.0, + "loss/logits": 0.25653262436389923, + "step": 3000 + }, + { + "epoch": 0.187625, + "grad_norm": 2.84375, + "grad_norm_var": 0.0658203125, + "learning_rate": 0.0001, + "loss": 7.8124, + "loss/crossentropy": 2.2351561784744263, + "loss/hidden": 3.203125, + "loss/jsd": 0.0, + "loss/logits": 0.25118912756443024, + "step": 3002 + }, + { + "epoch": 0.18775, + "grad_norm": 3.140625, + "grad_norm_var": 0.06357320149739583, + "learning_rate": 0.0001, + "loss": 8.0213, + "loss/crossentropy": 2.2484039068222046, + "loss/hidden": 3.2421875, + "loss/jsd": 0.0, + "loss/logits": 0.26490475982427597, + "step": 3004 + }, + { + "epoch": 0.187875, + "grad_norm": 3.078125, + "grad_norm_var": 0.06610921223958334, + "learning_rate": 0.0001, + "loss": 8.1035, + "loss/crossentropy": 2.3076168298721313, + "loss/hidden": 3.203125, + "loss/jsd": 0.0, + "loss/logits": 0.2600875496864319, + "step": 3006 + }, + { + "epoch": 0.188, + "grad_norm": 2.9375, + "grad_norm_var": 0.06648661295572916, + "learning_rate": 0.0001, + "loss": 8.2658, + "loss/crossentropy": 2.5785369873046875, + "loss/hidden": 3.1796875, + "loss/jsd": 0.0, + "loss/logits": 0.2783546894788742, + "step": 3008 + }, + { + "epoch": 0.188125, + "grad_norm": 3.1875, + "grad_norm_var": 0.05540262858072917, + "learning_rate": 0.0001, + "loss": 8.3027, + "loss/crossentropy": 2.3401262760162354, + "loss/hidden": 3.234375, + "loss/jsd": 0.0, + "loss/logits": 0.2552775740623474, + "step": 3010 + }, + { + "epoch": 0.18825, + "grad_norm": 2.890625, + "grad_norm_var": 0.029296875, + "learning_rate": 0.0001, + "loss": 8.182, + "loss/crossentropy": 2.122409999370575, + "loss/hidden": 3.1875, + "loss/jsd": 0.0, + "loss/logits": 0.25811081379652023, + "step": 3012 + }, + { + "epoch": 0.188375, + "grad_norm": 3.25, + "grad_norm_var": 0.023368326822916667, + "learning_rate": 0.0001, + "loss": 8.0199, + "loss/crossentropy": 2.427290201187134, + "loss/hidden": 3.2421875, + "loss/jsd": 0.0, + "loss/logits": 0.26948370039463043, + "step": 3014 + }, + { + "epoch": 0.1885, + "grad_norm": 2.984375, + "grad_norm_var": 0.0194732666015625, + "learning_rate": 0.0001, + "loss": 8.3842, + "loss/crossentropy": 2.355746865272522, + "loss/hidden": 3.234375, + "loss/jsd": 0.0, + "loss/logits": 0.2620701193809509, + "step": 3016 + }, + { + "epoch": 0.188625, + "grad_norm": 2.90625, + "grad_norm_var": 0.0181793212890625, + "learning_rate": 0.0001, + "loss": 7.9699, + "loss/crossentropy": 2.35839581489563, + "loss/hidden": 3.1796875, + "loss/jsd": 0.0, + "loss/logits": 0.26620611548423767, + "step": 3018 + }, + { + "epoch": 0.18875, + "grad_norm": 2.890625, + "grad_norm_var": 0.019917805989583332, + "learning_rate": 0.0001, + "loss": 8.1946, + "loss/crossentropy": 2.2129684686660767, + "loss/hidden": 3.1484375, + "loss/jsd": 0.0, + "loss/logits": 0.2607392221689224, + "step": 3020 + }, + { + "epoch": 0.188875, + "grad_norm": 3.046875, + "grad_norm_var": 0.019091796875, + "learning_rate": 0.0001, + "loss": 8.2211, + "loss/crossentropy": 2.1944016218185425, + "loss/hidden": 3.140625, + "loss/jsd": 0.0, + "loss/logits": 0.24655280262231827, + "step": 3022 + }, + { + "epoch": 0.189, + "grad_norm": 2.875, + "grad_norm_var": 0.0239166259765625, + "learning_rate": 0.0001, + "loss": 8.0436, + "loss/crossentropy": 2.1268292665481567, + "loss/hidden": 3.171875, + "loss/jsd": 0.0, + "loss/logits": 0.2602980434894562, + "step": 3024 + }, + { + "epoch": 0.189125, + "grad_norm": 2.875, + "grad_norm_var": 0.021512858072916665, + "learning_rate": 0.0001, + "loss": 8.108, + "loss/crossentropy": 2.4339241981506348, + "loss/hidden": 3.1484375, + "loss/jsd": 0.0, + "loss/logits": 0.26776036620140076, + "step": 3026 + }, + { + "epoch": 0.18925, + "grad_norm": 2.90625, + "grad_norm_var": 0.022337849934895834, + "learning_rate": 0.0001, + "loss": 8.1682, + "loss/crossentropy": 2.456682324409485, + "loss/hidden": 3.25, + "loss/jsd": 0.0, + "loss/logits": 0.27592889964580536, + "step": 3028 + }, + { + "epoch": 0.189375, + "grad_norm": 2.78125, + "grad_norm_var": 0.013630167643229166, + "learning_rate": 0.0001, + "loss": 7.9461, + "loss/crossentropy": 2.3544520139694214, + "loss/hidden": 3.1953125, + "loss/jsd": 0.0, + "loss/logits": 0.25780095160007477, + "step": 3030 + }, + { + "epoch": 0.1895, + "grad_norm": 3.09375, + "grad_norm_var": 0.014557902018229167, + "learning_rate": 0.0001, + "loss": 8.1715, + "loss/crossentropy": 2.363596200942993, + "loss/hidden": 3.2734375, + "loss/jsd": 0.0, + "loss/logits": 0.29543258249759674, + "step": 3032 + }, + { + "epoch": 0.189625, + "grad_norm": 3.484375, + "grad_norm_var": 0.03345947265625, + "learning_rate": 0.0001, + "loss": 8.2126, + "loss/crossentropy": 2.4506388902664185, + "loss/hidden": 3.234375, + "loss/jsd": 0.0, + "loss/logits": 0.2730669379234314, + "step": 3034 + }, + { + "epoch": 0.18975, + "grad_norm": 3.15625, + "grad_norm_var": 0.06018778483072917, + "learning_rate": 0.0001, + "loss": 8.2254, + "loss/crossentropy": 2.3496711254119873, + "loss/hidden": 3.1796875, + "loss/jsd": 0.0, + "loss/logits": 0.2570537030696869, + "step": 3036 + }, + { + "epoch": 0.189875, + "grad_norm": 3.09375, + "grad_norm_var": 0.061258951822916664, + "learning_rate": 0.0001, + "loss": 8.2739, + "loss/crossentropy": 2.3058911561965942, + "loss/hidden": 3.1953125, + "loss/jsd": 0.0, + "loss/logits": 0.2748931795358658, + "step": 3038 + }, + { + "epoch": 0.19, + "grad_norm": 2.90625, + "grad_norm_var": 0.05432942708333333, + "learning_rate": 0.0001, + "loss": 8.2554, + "loss/crossentropy": 2.1980836391448975, + "loss/hidden": 3.234375, + "loss/jsd": 0.0, + "loss/logits": 0.26495523750782013, + "step": 3040 + }, + { + "epoch": 0.190125, + "grad_norm": 3.421875, + "grad_norm_var": 0.064501953125, + "learning_rate": 0.0001, + "loss": 8.255, + "loss/crossentropy": 2.0824698209762573, + "loss/hidden": 3.2109375, + "loss/jsd": 0.0, + "loss/logits": 0.24418041110038757, + "step": 3042 + }, + { + "epoch": 0.19025, + "grad_norm": 3.359375, + "grad_norm_var": 0.07330729166666666, + "learning_rate": 0.0001, + "loss": 8.3164, + "loss/crossentropy": 2.5387042760849, + "loss/hidden": 3.234375, + "loss/jsd": 0.0, + "loss/logits": 0.2929651141166687, + "step": 3044 + }, + { + "epoch": 0.190375, + "grad_norm": 2.734375, + "grad_norm_var": 0.08557535807291666, + "learning_rate": 0.0001, + "loss": 7.973, + "loss/crossentropy": 2.1048532724380493, + "loss/hidden": 3.1484375, + "loss/jsd": 0.0, + "loss/logits": 0.23939846456050873, + "step": 3046 + }, + { + "epoch": 0.1905, + "grad_norm": 3.140625, + "grad_norm_var": 0.08424072265625, + "learning_rate": 0.0001, + "loss": 8.0964, + "loss/crossentropy": 2.418026924133301, + "loss/hidden": 3.21875, + "loss/jsd": 0.0, + "loss/logits": 0.27147340774536133, + "step": 3048 + }, + { + "epoch": 0.190625, + "grad_norm": 2.921875, + "grad_norm_var": 0.07753499348958333, + "learning_rate": 0.0001, + "loss": 8.2169, + "loss/crossentropy": 2.720233917236328, + "loss/hidden": 3.1953125, + "loss/jsd": 0.0, + "loss/logits": 0.26460620760917664, + "step": 3050 + }, + { + "epoch": 0.19075, + "grad_norm": 3.203125, + "grad_norm_var": 0.053511555989583334, + "learning_rate": 0.0001, + "loss": 8.2677, + "loss/crossentropy": 2.3085511922836304, + "loss/hidden": 3.2265625, + "loss/jsd": 0.0, + "loss/logits": 0.2717476040124893, + "step": 3052 + }, + { + "epoch": 0.190875, + "grad_norm": 3.03125, + "grad_norm_var": 0.05455729166666667, + "learning_rate": 0.0001, + "loss": 8.2355, + "loss/crossentropy": 2.387621521949768, + "loss/hidden": 3.21875, + "loss/jsd": 0.0, + "loss/logits": 0.27137576043605804, + "step": 3054 + }, + { + "epoch": 0.191, + "grad_norm": 3.0625, + "grad_norm_var": 0.052708943684895836, + "learning_rate": 0.0001, + "loss": 8.2354, + "loss/crossentropy": 2.2664815187454224, + "loss/hidden": 3.328125, + "loss/jsd": 0.0, + "loss/logits": 0.27788451313972473, + "step": 3056 + }, + { + "epoch": 0.191125, + "grad_norm": 2.90625, + "grad_norm_var": 0.04145406087239583, + "learning_rate": 0.0001, + "loss": 8.1699, + "loss/crossentropy": 2.19650661945343, + "loss/hidden": 3.1875, + "loss/jsd": 0.0, + "loss/logits": 0.2952868938446045, + "step": 3058 + }, + { + "epoch": 0.19125, + "grad_norm": 2.890625, + "grad_norm_var": 0.023433430989583334, + "learning_rate": 0.0001, + "loss": 8.1568, + "loss/crossentropy": 2.2924450635910034, + "loss/hidden": 3.15625, + "loss/jsd": 0.0, + "loss/logits": 0.26796723902225494, + "step": 3060 + }, + { + "epoch": 0.191375, + "grad_norm": 3.140625, + "grad_norm_var": 0.011197916666666667, + "learning_rate": 0.0001, + "loss": 8.1256, + "loss/crossentropy": 2.516156315803528, + "loss/hidden": 3.2421875, + "loss/jsd": 0.0, + "loss/logits": 0.265813946723938, + "step": 3062 + }, + { + "epoch": 0.1915, + "grad_norm": 3.046875, + "grad_norm_var": 0.010570271809895834, + "learning_rate": 0.0001, + "loss": 8.2824, + "loss/crossentropy": 2.3948129415512085, + "loss/hidden": 3.2109375, + "loss/jsd": 0.0, + "loss/logits": 0.28667983412742615, + "step": 3064 + }, + { + "epoch": 0.191625, + "grad_norm": 3.15625, + "grad_norm_var": 0.010130818684895833, + "learning_rate": 0.0001, + "loss": 8.5698, + "loss/crossentropy": 2.627001643180847, + "loss/hidden": 3.2890625, + "loss/jsd": 0.0, + "loss/logits": 0.302081897854805, + "step": 3066 + }, + { + "epoch": 0.19175, + "grad_norm": 3.3125, + "grad_norm_var": 0.012906901041666667, + "learning_rate": 0.0001, + "loss": 8.2584, + "loss/crossentropy": 2.6413527727127075, + "loss/hidden": 3.265625, + "loss/jsd": 0.0, + "loss/logits": 0.27230359613895416, + "step": 3068 + }, + { + "epoch": 0.191875, + "grad_norm": 3.03125, + "grad_norm_var": 0.011091105143229167, + "learning_rate": 0.0001, + "loss": 8.2915, + "loss/crossentropy": 2.3797744512557983, + "loss/hidden": 3.1640625, + "loss/jsd": 0.0, + "loss/logits": 0.25851966440677643, + "step": 3070 + }, + { + "epoch": 0.192, + "grad_norm": 3.265625, + "grad_norm_var": 0.013426717122395833, + "learning_rate": 0.0001, + "loss": 8.3002, + "loss/crossentropy": 2.2201138734817505, + "loss/hidden": 3.1953125, + "loss/jsd": 0.0, + "loss/logits": 0.2628394812345505, + "step": 3072 + }, + { + "epoch": 0.192125, + "grad_norm": 3.09375, + "grad_norm_var": 0.012555948893229167, + "learning_rate": 0.0001, + "loss": 8.1419, + "loss/crossentropy": 2.0795114636421204, + "loss/hidden": 3.1640625, + "loss/jsd": 0.0, + "loss/logits": 0.2654409408569336, + "step": 3074 + }, + { + "epoch": 0.19225, + "grad_norm": 2.75, + "grad_norm_var": 0.0164703369140625, + "learning_rate": 0.0001, + "loss": 7.996, + "loss/crossentropy": 2.2035024166107178, + "loss/hidden": 3.125, + "loss/jsd": 0.0, + "loss/logits": 0.2541028633713722, + "step": 3076 + }, + { + "epoch": 0.192375, + "grad_norm": 2.828125, + "grad_norm_var": 0.02164306640625, + "learning_rate": 0.0001, + "loss": 8.1765, + "loss/crossentropy": 2.4021114110946655, + "loss/hidden": 3.171875, + "loss/jsd": 0.0, + "loss/logits": 0.2582816928625107, + "step": 3078 + }, + { + "epoch": 0.1925, + "grad_norm": 2.953125, + "grad_norm_var": 0.022679646809895832, + "learning_rate": 0.0001, + "loss": 7.9373, + "loss/crossentropy": 2.2895009517669678, + "loss/hidden": 3.2578125, + "loss/jsd": 0.0, + "loss/logits": 0.256507083773613, + "step": 3080 + }, + { + "epoch": 0.192625, + "grad_norm": 2.8125, + "grad_norm_var": 0.0277252197265625, + "learning_rate": 0.0001, + "loss": 7.8838, + "loss/crossentropy": 2.027153968811035, + "loss/hidden": 3.1953125, + "loss/jsd": 0.0, + "loss/logits": 0.23617319762706757, + "step": 3082 + }, + { + "epoch": 0.19275, + "grad_norm": 3.078125, + "grad_norm_var": 0.0377105712890625, + "learning_rate": 0.0001, + "loss": 8.3783, + "loss/crossentropy": 2.6372686624526978, + "loss/hidden": 3.21875, + "loss/jsd": 0.0, + "loss/logits": 0.2389061003923416, + "step": 3084 + }, + { + "epoch": 0.192875, + "grad_norm": 3.046875, + "grad_norm_var": 0.037369791666666666, + "learning_rate": 0.0001, + "loss": 8.3173, + "loss/crossentropy": 2.4116551876068115, + "loss/hidden": 3.234375, + "loss/jsd": 0.0, + "loss/logits": 0.27836497128009796, + "step": 3086 + }, + { + "epoch": 0.193, + "grad_norm": 3.046875, + "grad_norm_var": 0.04472554524739583, + "learning_rate": 0.0001, + "loss": 8.0528, + "loss/crossentropy": 2.435407042503357, + "loss/hidden": 3.203125, + "loss/jsd": 0.0, + "loss/logits": 0.25427422672510147, + "step": 3088 + }, + { + "epoch": 0.193125, + "grad_norm": 2.953125, + "grad_norm_var": 0.042769368489583334, + "learning_rate": 0.0001, + "loss": 8.2524, + "loss/crossentropy": 2.6872342824935913, + "loss/hidden": 3.28125, + "loss/jsd": 0.0, + "loss/logits": 0.2897500991821289, + "step": 3090 + }, + { + "epoch": 0.19325, + "grad_norm": 2.734375, + "grad_norm_var": 0.04388020833333333, + "learning_rate": 0.0001, + "loss": 8.1072, + "loss/crossentropy": 2.4015711545944214, + "loss/hidden": 3.21875, + "loss/jsd": 0.0, + "loss/logits": 0.2563893646001816, + "step": 3092 + }, + { + "epoch": 0.193375, + "grad_norm": 3.03125, + "grad_norm_var": 0.04967041015625, + "learning_rate": 0.0001, + "loss": 7.8814, + "loss/crossentropy": 2.3149009943008423, + "loss/hidden": 3.1328125, + "loss/jsd": 0.0, + "loss/logits": 0.254148505628109, + "step": 3094 + }, + { + "epoch": 0.1935, + "grad_norm": 2.671875, + "grad_norm_var": 0.06373291015625, + "learning_rate": 0.0001, + "loss": 7.6511, + "loss/crossentropy": 1.942514955997467, + "loss/hidden": 3.1640625, + "loss/jsd": 0.0, + "loss/logits": 0.2268311232328415, + "step": 3096 + }, + { + "epoch": 0.193625, + "grad_norm": 3.15625, + "grad_norm_var": 0.05998942057291667, + "learning_rate": 0.0001, + "loss": 8.1055, + "loss/crossentropy": 2.512845039367676, + "loss/hidden": 3.265625, + "loss/jsd": 0.0, + "loss/logits": 0.28133875131607056, + "step": 3098 + }, + { + "epoch": 0.19375, + "grad_norm": 2.84375, + "grad_norm_var": 0.047591145833333334, + "learning_rate": 0.0001, + "loss": 8.0487, + "loss/crossentropy": 2.191560924053192, + "loss/hidden": 3.1015625, + "loss/jsd": 0.0, + "loss/logits": 0.24201743304729462, + "step": 3100 + }, + { + "epoch": 0.193875, + "grad_norm": 3.65625, + "grad_norm_var": 0.14207356770833332, + "learning_rate": 0.0001, + "loss": 8.2637, + "loss/crossentropy": 2.3829805850982666, + "loss/hidden": 3.1953125, + "loss/jsd": 0.0, + "loss/logits": 0.2581256628036499, + "step": 3102 + }, + { + "epoch": 0.194, + "grad_norm": 3.109375, + "grad_norm_var": 0.13280843098958334, + "learning_rate": 0.0001, + "loss": 8.2235, + "loss/crossentropy": 2.425102114677429, + "loss/hidden": 3.2421875, + "loss/jsd": 0.0, + "loss/logits": 0.2605956494808197, + "step": 3104 + }, + { + "epoch": 0.194125, + "grad_norm": 3.15625, + "grad_norm_var": 0.1308502197265625, + "learning_rate": 0.0001, + "loss": 8.2458, + "loss/crossentropy": 2.274181604385376, + "loss/hidden": 3.1796875, + "loss/jsd": 0.0, + "loss/logits": 0.2674994617700577, + "step": 3106 + }, + { + "epoch": 0.19425, + "grad_norm": 2.96875, + "grad_norm_var": 0.12086181640625, + "learning_rate": 0.0001, + "loss": 8.123, + "loss/crossentropy": 2.2365976572036743, + "loss/hidden": 3.140625, + "loss/jsd": 0.0, + "loss/logits": 0.25523822009563446, + "step": 3108 + }, + { + "epoch": 0.194375, + "grad_norm": 3.328125, + "grad_norm_var": 0.11941731770833333, + "learning_rate": 0.0001, + "loss": 8.0437, + "loss/crossentropy": 2.065447449684143, + "loss/hidden": 3.2109375, + "loss/jsd": 0.0, + "loss/logits": 0.2694360166788101, + "step": 3110 + }, + { + "epoch": 0.1945, + "grad_norm": 2.921875, + "grad_norm_var": 0.11741536458333333, + "learning_rate": 0.0001, + "loss": 8.3343, + "loss/crossentropy": 2.4269362688064575, + "loss/hidden": 3.1640625, + "loss/jsd": 0.0, + "loss/logits": 0.25637270510196686, + "step": 3112 + }, + { + "epoch": 0.194625, + "grad_norm": 3.0625, + "grad_norm_var": 0.11825764973958333, + "learning_rate": 0.0001, + "loss": 8.2654, + "loss/crossentropy": 2.4185571670532227, + "loss/hidden": 3.1640625, + "loss/jsd": 0.0, + "loss/logits": 0.26571086049079895, + "step": 3114 + }, + { + "epoch": 0.19475, + "grad_norm": 3.03125, + "grad_norm_var": 0.10660807291666667, + "learning_rate": 0.0001, + "loss": 7.9204, + "loss/crossentropy": 2.3637081384658813, + "loss/hidden": 3.2265625, + "loss/jsd": 0.0, + "loss/logits": 0.25697462260723114, + "step": 3116 + }, + { + "epoch": 0.194875, + "grad_norm": 3.0, + "grad_norm_var": 0.03491923014322917, + "learning_rate": 0.0001, + "loss": 8.1335, + "loss/crossentropy": 2.3663218021392822, + "loss/hidden": 3.1640625, + "loss/jsd": 0.0, + "loss/logits": 0.2510784715414047, + "step": 3118 + }, + { + "epoch": 0.195, + "grad_norm": 2.953125, + "grad_norm_var": 0.03645731608072917, + "learning_rate": 0.0001, + "loss": 8.0906, + "loss/crossentropy": 2.4295765161514282, + "loss/hidden": 3.2421875, + "loss/jsd": 0.0, + "loss/logits": 0.2629868686199188, + "step": 3120 + }, + { + "epoch": 0.195125, + "grad_norm": 2.90625, + "grad_norm_var": 0.03824462890625, + "learning_rate": 0.0001, + "loss": 8.1089, + "loss/crossentropy": 2.334370255470276, + "loss/hidden": 3.21875, + "loss/jsd": 0.0, + "loss/logits": 0.25758758932352066, + "step": 3122 + }, + { + "epoch": 0.19525, + "grad_norm": 2.8125, + "grad_norm_var": 0.042313639322916666, + "learning_rate": 0.0001, + "loss": 8.0055, + "loss/crossentropy": 2.156652331352234, + "loss/hidden": 3.125, + "loss/jsd": 0.0, + "loss/logits": 0.24994631111621857, + "step": 3124 + }, + { + "epoch": 0.195375, + "grad_norm": 2.921875, + "grad_norm_var": 0.0381744384765625, + "learning_rate": 0.0001, + "loss": 8.1386, + "loss/crossentropy": 2.3902939558029175, + "loss/hidden": 3.1953125, + "loss/jsd": 0.0, + "loss/logits": 0.2595854699611664, + "step": 3126 + }, + { + "epoch": 0.1955, + "grad_norm": 3.28125, + "grad_norm_var": 0.01337890625, + "learning_rate": 0.0001, + "loss": 8.0796, + "loss/crossentropy": 2.371825695037842, + "loss/hidden": 3.1640625, + "loss/jsd": 0.0, + "loss/logits": 0.26566849648952484, + "step": 3128 + }, + { + "epoch": 0.195625, + "grad_norm": 2.828125, + "grad_norm_var": 0.017235310872395833, + "learning_rate": 0.0001, + "loss": 8.0604, + "loss/crossentropy": 2.0502688884735107, + "loss/hidden": 3.2109375, + "loss/jsd": 0.0, + "loss/logits": 0.24038948118686676, + "step": 3130 + }, + { + "epoch": 0.19575, + "grad_norm": 3.125, + "grad_norm_var": 0.01881103515625, + "learning_rate": 0.0001, + "loss": 8.1111, + "loss/crossentropy": 2.6462838649749756, + "loss/hidden": 3.171875, + "loss/jsd": 0.0, + "loss/logits": 0.27009811997413635, + "step": 3132 + }, + { + "epoch": 0.195875, + "grad_norm": 3.15625, + "grad_norm_var": 0.02281494140625, + "learning_rate": 0.0001, + "loss": 8.3116, + "loss/crossentropy": 2.3181967735290527, + "loss/hidden": 3.234375, + "loss/jsd": 0.0, + "loss/logits": 0.2602214217185974, + "step": 3134 + }, + { + "epoch": 0.196, + "grad_norm": 3.171875, + "grad_norm_var": 0.022526041666666666, + "learning_rate": 0.0001, + "loss": 8.3065, + "loss/crossentropy": 2.4056873321533203, + "loss/hidden": 3.1875, + "loss/jsd": 0.0, + "loss/logits": 0.26275748014450073, + "step": 3136 + }, + { + "epoch": 0.196125, + "grad_norm": 3.359375, + "grad_norm_var": 0.02760009765625, + "learning_rate": 0.0001, + "loss": 8.1514, + "loss/crossentropy": 2.278952717781067, + "loss/hidden": 3.140625, + "loss/jsd": 0.0, + "loss/logits": 0.2747359275817871, + "step": 3138 + }, + { + "epoch": 0.19625, + "grad_norm": 2.90625, + "grad_norm_var": 0.0237213134765625, + "learning_rate": 0.0001, + "loss": 8.1014, + "loss/crossentropy": 2.417848587036133, + "loss/hidden": 3.21875, + "loss/jsd": 0.0, + "loss/logits": 0.26167601346969604, + "step": 3140 + }, + { + "epoch": 0.196375, + "grad_norm": 3.109375, + "grad_norm_var": 0.0255279541015625, + "learning_rate": 0.0001, + "loss": 8.2388, + "loss/crossentropy": 2.444987416267395, + "loss/hidden": 3.203125, + "loss/jsd": 0.0, + "loss/logits": 0.25386446714401245, + "step": 3142 + }, + { + "epoch": 0.1965, + "grad_norm": 3.046875, + "grad_norm_var": 0.022705078125, + "learning_rate": 0.0001, + "loss": 8.2145, + "loss/crossentropy": 2.581206440925598, + "loss/hidden": 3.171875, + "loss/jsd": 0.0, + "loss/logits": 0.2710491418838501, + "step": 3144 + }, + { + "epoch": 0.196625, + "grad_norm": 2.828125, + "grad_norm_var": 0.0254058837890625, + "learning_rate": 0.0001, + "loss": 8.0613, + "loss/crossentropy": 2.3687140941619873, + "loss/hidden": 3.1484375, + "loss/jsd": 0.0, + "loss/logits": 0.26861967146396637, + "step": 3146 + }, + { + "epoch": 0.19675, + "grad_norm": 2.9375, + "grad_norm_var": 0.07899983723958333, + "learning_rate": 0.0001, + "loss": 8.3689, + "loss/crossentropy": 2.3609704971313477, + "loss/hidden": 3.2265625, + "loss/jsd": 0.0, + "loss/logits": 0.26762421429157257, + "step": 3148 + }, + { + "epoch": 0.196875, + "grad_norm": 2.765625, + "grad_norm_var": 0.08605855305989583, + "learning_rate": 0.0001, + "loss": 7.9722, + "loss/crossentropy": 2.097190797328949, + "loss/hidden": 3.109375, + "loss/jsd": 0.0, + "loss/logits": 0.24098625779151917, + "step": 3150 + }, + { + "epoch": 0.197, + "grad_norm": 3.09375, + "grad_norm_var": 0.0851226806640625, + "learning_rate": 0.0001, + "loss": 8.2039, + "loss/crossentropy": 2.257096529006958, + "loss/hidden": 3.140625, + "loss/jsd": 0.0, + "loss/logits": 0.2598903179168701, + "step": 3152 + }, + { + "epoch": 0.197125, + "grad_norm": 2.875, + "grad_norm_var": 0.08181050618489584, + "learning_rate": 0.0001, + "loss": 8.1718, + "loss/crossentropy": 2.4441792964935303, + "loss/hidden": 3.1796875, + "loss/jsd": 0.0, + "loss/logits": 0.2703270763158798, + "step": 3154 + }, + { + "epoch": 0.19725, + "grad_norm": 3.171875, + "grad_norm_var": 0.0839996337890625, + "learning_rate": 0.0001, + "loss": 8.1738, + "loss/crossentropy": 2.337058424949646, + "loss/hidden": 3.1953125, + "loss/jsd": 0.0, + "loss/logits": 0.25680898129940033, + "step": 3156 + }, + { + "epoch": 0.197375, + "grad_norm": 2.96875, + "grad_norm_var": 0.08325093587239583, + "learning_rate": 0.0001, + "loss": 8.1708, + "loss/crossentropy": 2.3203121423721313, + "loss/hidden": 3.1875, + "loss/jsd": 0.0, + "loss/logits": 0.25183166563510895, + "step": 3158 + }, + { + "epoch": 0.1975, + "grad_norm": 2.875, + "grad_norm_var": 0.08502604166666666, + "learning_rate": 0.0001, + "loss": 7.9677, + "loss/crossentropy": 2.3680754899978638, + "loss/hidden": 3.125, + "loss/jsd": 0.0, + "loss/logits": 0.2431572675704956, + "step": 3160 + }, + { + "epoch": 0.197625, + "grad_norm": 3.203125, + "grad_norm_var": 0.08133036295572917, + "learning_rate": 0.0001, + "loss": 8.203, + "loss/crossentropy": 2.241086721420288, + "loss/hidden": 3.25, + "loss/jsd": 0.0, + "loss/logits": 0.27713850140571594, + "step": 3162 + }, + { + "epoch": 0.19775, + "grad_norm": 2.9375, + "grad_norm_var": 0.025260416666666667, + "learning_rate": 0.0001, + "loss": 8.3118, + "loss/crossentropy": 2.5448096990585327, + "loss/hidden": 3.2421875, + "loss/jsd": 0.0, + "loss/logits": 0.2678837478160858, + "step": 3164 + }, + { + "epoch": 0.197875, + "grad_norm": 2.796875, + "grad_norm_var": 0.022956339518229167, + "learning_rate": 0.0001, + "loss": 8.0958, + "loss/crossentropy": 2.4013454914093018, + "loss/hidden": 3.1953125, + "loss/jsd": 0.0, + "loss/logits": 0.2578812763094902, + "step": 3166 + }, + { + "epoch": 0.198, + "grad_norm": 3.109375, + "grad_norm_var": 0.0243560791015625, + "learning_rate": 0.0001, + "loss": 8.3649, + "loss/crossentropy": 2.399568200111389, + "loss/hidden": 3.1796875, + "loss/jsd": 0.0, + "loss/logits": 0.3332839608192444, + "step": 3168 + }, + { + "epoch": 0.198125, + "grad_norm": 2.78125, + "grad_norm_var": 0.023726399739583334, + "learning_rate": 0.0001, + "loss": 7.9721, + "loss/crossentropy": 2.496425151824951, + "loss/hidden": 3.2734375, + "loss/jsd": 0.0, + "loss/logits": 0.28880220651626587, + "step": 3170 + }, + { + "epoch": 0.19825, + "grad_norm": 2.953125, + "grad_norm_var": 0.020992024739583334, + "learning_rate": 0.0001, + "loss": 8.2263, + "loss/crossentropy": 2.2559762001037598, + "loss/hidden": 3.1328125, + "loss/jsd": 0.0, + "loss/logits": 0.263287752866745, + "step": 3172 + }, + { + "epoch": 0.198375, + "grad_norm": 3.0, + "grad_norm_var": 0.018294270833333334, + "learning_rate": 0.0001, + "loss": 8.2044, + "loss/crossentropy": 2.4847277402877808, + "loss/hidden": 3.1796875, + "loss/jsd": 0.0, + "loss/logits": 0.26287899166345596, + "step": 3174 + }, + { + "epoch": 0.1985, + "grad_norm": 3.109375, + "grad_norm_var": 0.01822509765625, + "learning_rate": 0.0001, + "loss": 8.0561, + "loss/crossentropy": 2.5184192657470703, + "loss/hidden": 3.2109375, + "loss/jsd": 0.0, + "loss/logits": 0.274020254611969, + "step": 3176 + }, + { + "epoch": 0.198625, + "grad_norm": 3.015625, + "grad_norm_var": 0.01279296875, + "learning_rate": 0.0001, + "loss": 8.0441, + "loss/crossentropy": 2.276871681213379, + "loss/hidden": 3.1015625, + "loss/jsd": 0.0, + "loss/logits": 0.23629513382911682, + "step": 3178 + }, + { + "epoch": 0.19875, + "grad_norm": 3.203125, + "grad_norm_var": 0.016014607747395833, + "learning_rate": 0.0001, + "loss": 8.1896, + "loss/crossentropy": 2.236708164215088, + "loss/hidden": 3.1796875, + "loss/jsd": 0.0, + "loss/logits": 0.2376183420419693, + "step": 3180 + }, + { + "epoch": 0.198875, + "grad_norm": 2.859375, + "grad_norm_var": 0.0204010009765625, + "learning_rate": 0.0001, + "loss": 7.8184, + "loss/crossentropy": 2.289997696876526, + "loss/hidden": 3.109375, + "loss/jsd": 0.0, + "loss/logits": 0.26156531274318695, + "step": 3182 + }, + { + "epoch": 0.199, + "grad_norm": 2.890625, + "grad_norm_var": 0.0199859619140625, + "learning_rate": 0.0001, + "loss": 8.3089, + "loss/crossentropy": 2.327690005302429, + "loss/hidden": 3.2109375, + "loss/jsd": 0.0, + "loss/logits": 0.2894662618637085, + "step": 3184 + }, + { + "epoch": 0.199125, + "grad_norm": 3.203125, + "grad_norm_var": 0.02037353515625, + "learning_rate": 0.0001, + "loss": 8.0743, + "loss/crossentropy": 2.418406367301941, + "loss/hidden": 3.1796875, + "loss/jsd": 0.0, + "loss/logits": 0.25407615303993225, + "step": 3186 + }, + { + "epoch": 0.19925, + "grad_norm": 2.8125, + "grad_norm_var": 0.0223785400390625, + "learning_rate": 0.0001, + "loss": 8.0573, + "loss/crossentropy": 2.243067741394043, + "loss/hidden": 3.125, + "loss/jsd": 0.0, + "loss/logits": 0.2523237615823746, + "step": 3188 + }, + { + "epoch": 0.199375, + "grad_norm": 3.078125, + "grad_norm_var": 0.027242024739583332, + "learning_rate": 0.0001, + "loss": 7.8769, + "loss/crossentropy": 2.4334983825683594, + "loss/hidden": 3.125, + "loss/jsd": 0.0, + "loss/logits": 0.23196329176425934, + "step": 3190 + }, + { + "epoch": 0.1995, + "grad_norm": 2.65625, + "grad_norm_var": 0.035868326822916664, + "learning_rate": 0.0001, + "loss": 7.9538, + "loss/crossentropy": 2.311228036880493, + "loss/hidden": 3.125, + "loss/jsd": 0.0, + "loss/logits": 0.2579493597149849, + "step": 3192 + }, + { + "epoch": 0.199625, + "grad_norm": 3.140625, + "grad_norm_var": 0.03538309733072917, + "learning_rate": 0.0001, + "loss": 8.2196, + "loss/crossentropy": 2.431540369987488, + "loss/hidden": 3.125, + "loss/jsd": 0.0, + "loss/logits": 0.25777457654476166, + "step": 3194 + }, + { + "epoch": 0.19975, + "grad_norm": 2.953125, + "grad_norm_var": 0.03191731770833333, + "learning_rate": 0.0001, + "loss": 8.139, + "loss/crossentropy": 2.1621546745300293, + "loss/hidden": 3.140625, + "loss/jsd": 0.0, + "loss/logits": 0.2560386508703232, + "step": 3196 + }, + { + "epoch": 0.199875, + "grad_norm": 2.828125, + "grad_norm_var": 0.025712076822916666, + "learning_rate": 0.0001, + "loss": 7.97, + "loss/crossentropy": 2.1923757791519165, + "loss/hidden": 3.1171875, + "loss/jsd": 0.0, + "loss/logits": 0.2589757889509201, + "step": 3198 + }, + { + "epoch": 0.2, + "grad_norm": 2.875, + "grad_norm_var": 0.0247467041015625, + "learning_rate": 0.0001, + "loss": 8.0392, + "loss/crossentropy": 2.3970407247543335, + "loss/hidden": 3.125, + "loss/jsd": 0.0, + "loss/logits": 0.2541755437850952, + "step": 3200 + }, + { + "epoch": 0.200125, + "grad_norm": 2.8125, + "grad_norm_var": 0.020406087239583332, + "learning_rate": 0.0001, + "loss": 8.0207, + "loss/crossentropy": 2.2216036319732666, + "loss/hidden": 3.1640625, + "loss/jsd": 0.0, + "loss/logits": 0.26332978904247284, + "step": 3202 + }, + { + "epoch": 0.20025, + "grad_norm": 2.875, + "grad_norm_var": 0.02056884765625, + "learning_rate": 0.0001, + "loss": 8.2514, + "loss/crossentropy": 2.2952345609664917, + "loss/hidden": 3.109375, + "loss/jsd": 0.0, + "loss/logits": 0.24460511654615402, + "step": 3204 + }, + { + "epoch": 0.200375, + "grad_norm": 3.171875, + "grad_norm_var": 0.027497355143229166, + "learning_rate": 0.0001, + "loss": 8.3172, + "loss/crossentropy": 2.3329302072525024, + "loss/hidden": 3.203125, + "loss/jsd": 0.0, + "loss/logits": 0.3068646192550659, + "step": 3206 + }, + { + "epoch": 0.2005, + "grad_norm": 3.0, + "grad_norm_var": 0.018464152018229166, + "learning_rate": 0.0001, + "loss": 8.0853, + "loss/crossentropy": 2.4682726860046387, + "loss/hidden": 3.203125, + "loss/jsd": 0.0, + "loss/logits": 0.2747759073972702, + "step": 3208 + }, + { + "epoch": 0.200625, + "grad_norm": 2.90625, + "grad_norm_var": 0.015555826822916667, + "learning_rate": 0.0001, + "loss": 8.084, + "loss/crossentropy": 2.385040760040283, + "loss/hidden": 3.171875, + "loss/jsd": 0.0, + "loss/logits": 0.2688244730234146, + "step": 3210 + }, + { + "epoch": 0.20075, + "grad_norm": 2.984375, + "grad_norm_var": 0.01578369140625, + "learning_rate": 0.0001, + "loss": 8.2028, + "loss/crossentropy": 2.3609360456466675, + "loss/hidden": 3.21875, + "loss/jsd": 0.0, + "loss/logits": 0.2741740494966507, + "step": 3212 + }, + { + "epoch": 0.200875, + "grad_norm": 3.078125, + "grad_norm_var": 0.015946451822916666, + "learning_rate": 0.0001, + "loss": 8.1162, + "loss/crossentropy": 2.268782615661621, + "loss/hidden": 3.125, + "loss/jsd": 0.0, + "loss/logits": 0.25281913578510284, + "step": 3214 + }, + { + "epoch": 0.201, + "grad_norm": 2.90625, + "grad_norm_var": 0.0151519775390625, + "learning_rate": 0.0001, + "loss": 7.9198, + "loss/crossentropy": 2.2365881204605103, + "loss/hidden": 3.171875, + "loss/jsd": 0.0, + "loss/logits": 0.24488338828086853, + "step": 3216 + }, + { + "epoch": 0.201125, + "grad_norm": 3.125, + "grad_norm_var": 0.014134724934895834, + "learning_rate": 0.0001, + "loss": 8.0622, + "loss/crossentropy": 2.333191156387329, + "loss/hidden": 3.1953125, + "loss/jsd": 0.0, + "loss/logits": 0.2846767157316208, + "step": 3218 + }, + { + "epoch": 0.20125, + "grad_norm": 2.984375, + "grad_norm_var": 0.014925130208333333, + "learning_rate": 0.0001, + "loss": 7.765, + "loss/crossentropy": 2.2031763792037964, + "loss/hidden": 3.1875, + "loss/jsd": 0.0, + "loss/logits": 0.2630941718816757, + "step": 3220 + }, + { + "epoch": 0.201375, + "grad_norm": 2.953125, + "grad_norm_var": 0.007515462239583334, + "learning_rate": 0.0001, + "loss": 7.8986, + "loss/crossentropy": 2.228626847267151, + "loss/hidden": 3.15625, + "loss/jsd": 0.0, + "loss/logits": 0.24427320063114166, + "step": 3222 + }, + { + "epoch": 0.2015, + "grad_norm": 3.0625, + "grad_norm_var": 0.007710774739583333, + "learning_rate": 0.0001, + "loss": 7.8828, + "loss/crossentropy": 2.3146544694900513, + "loss/hidden": 3.15625, + "loss/jsd": 0.0, + "loss/logits": 0.26063594222068787, + "step": 3224 + }, + { + "epoch": 0.201625, + "grad_norm": 2.921875, + "grad_norm_var": 0.0090484619140625, + "learning_rate": 0.0001, + "loss": 8.0466, + "loss/crossentropy": 2.5943726301193237, + "loss/hidden": 3.1875, + "loss/jsd": 0.0, + "loss/logits": 0.2648574709892273, + "step": 3226 + }, + { + "epoch": 0.20175, + "grad_norm": 2.75, + "grad_norm_var": 0.0123687744140625, + "learning_rate": 0.0001, + "loss": 8.1293, + "loss/crossentropy": 2.481694221496582, + "loss/hidden": 3.1796875, + "loss/jsd": 0.0, + "loss/logits": 0.26219360530376434, + "step": 3228 + }, + { + "epoch": 0.201875, + "grad_norm": 2.9375, + "grad_norm_var": 0.010578409830729166, + "learning_rate": 0.0001, + "loss": 8.0384, + "loss/crossentropy": 2.2695223093032837, + "loss/hidden": 3.125, + "loss/jsd": 0.0, + "loss/logits": 0.27425651252269745, + "step": 3230 + }, + { + "epoch": 0.202, + "grad_norm": 2.75, + "grad_norm_var": 0.013736979166666666, + "learning_rate": 0.0001, + "loss": 8.0473, + "loss/crossentropy": 2.2828463315963745, + "loss/hidden": 3.15625, + "loss/jsd": 0.0, + "loss/logits": 0.25569504499435425, + "step": 3232 + }, + { + "epoch": 0.202125, + "grad_norm": 3.0625, + "grad_norm_var": 0.012450154622395833, + "learning_rate": 0.0001, + "loss": 8.0475, + "loss/crossentropy": 2.402729630470276, + "loss/hidden": 3.2109375, + "loss/jsd": 0.0, + "loss/logits": 0.26215188205242157, + "step": 3234 + }, + { + "epoch": 0.20225, + "grad_norm": 2.953125, + "grad_norm_var": 0.0115142822265625, + "learning_rate": 0.0001, + "loss": 8.1686, + "loss/crossentropy": 2.3639482259750366, + "loss/hidden": 3.15625, + "loss/jsd": 0.0, + "loss/logits": 0.2587997019290924, + "step": 3236 + }, + { + "epoch": 0.202375, + "grad_norm": 2.96875, + "grad_norm_var": 0.012044270833333334, + "learning_rate": 0.0001, + "loss": 8.0236, + "loss/crossentropy": 2.2139264345169067, + "loss/hidden": 3.1484375, + "loss/jsd": 0.0, + "loss/logits": 0.25370609760284424, + "step": 3238 + }, + { + "epoch": 0.2025, + "grad_norm": 2.765625, + "grad_norm_var": 0.01025390625, + "learning_rate": 0.0001, + "loss": 7.8858, + "loss/crossentropy": 2.37356698513031, + "loss/hidden": 3.1484375, + "loss/jsd": 0.0, + "loss/logits": 0.2569102644920349, + "step": 3240 + }, + { + "epoch": 0.202625, + "grad_norm": 3.09375, + "grad_norm_var": 0.015013631184895833, + "learning_rate": 0.0001, + "loss": 8.0907, + "loss/crossentropy": 2.1355791091918945, + "loss/hidden": 3.1953125, + "loss/jsd": 0.0, + "loss/logits": 0.2584020048379898, + "step": 3242 + }, + { + "epoch": 0.20275, + "grad_norm": 2.890625, + "grad_norm_var": 0.013963826497395833, + "learning_rate": 0.0001, + "loss": 7.9375, + "loss/crossentropy": 2.1258978247642517, + "loss/hidden": 3.1796875, + "loss/jsd": 0.0, + "loss/logits": 0.2648678421974182, + "step": 3244 + }, + { + "epoch": 0.202875, + "grad_norm": 2.9375, + "grad_norm_var": 0.014383951822916666, + "learning_rate": 0.0001, + "loss": 8.0241, + "loss/crossentropy": 2.2715872526168823, + "loss/hidden": 3.1796875, + "loss/jsd": 0.0, + "loss/logits": 0.2862909138202667, + "step": 3246 + }, + { + "epoch": 0.203, + "grad_norm": 3.0, + "grad_norm_var": 0.018473307291666668, + "learning_rate": 0.0001, + "loss": 8.1095, + "loss/crossentropy": 2.333642363548279, + "loss/hidden": 3.2109375, + "loss/jsd": 0.0, + "loss/logits": 0.2708228975534439, + "step": 3248 + }, + { + "epoch": 0.203125, + "grad_norm": 3.171875, + "grad_norm_var": 0.0251953125, + "learning_rate": 0.0001, + "loss": 8.3236, + "loss/crossentropy": 2.464186906814575, + "loss/hidden": 3.15625, + "loss/jsd": 0.0, + "loss/logits": 0.2748962640762329, + "step": 3250 + }, + { + "epoch": 0.20325, + "grad_norm": 3.09375, + "grad_norm_var": 0.0252838134765625, + "learning_rate": 0.0001, + "loss": 8.289, + "loss/crossentropy": 2.0890082120895386, + "loss/hidden": 3.1484375, + "loss/jsd": 0.0, + "loss/logits": 0.24715138971805573, + "step": 3252 + }, + { + "epoch": 0.203375, + "grad_norm": 2.765625, + "grad_norm_var": 0.0266998291015625, + "learning_rate": 0.0001, + "loss": 7.9402, + "loss/crossentropy": 2.181369960308075, + "loss/hidden": 3.2109375, + "loss/jsd": 0.0, + "loss/logits": 0.26574352383613586, + "step": 3254 + }, + { + "epoch": 0.2035, + "grad_norm": 3.140625, + "grad_norm_var": 0.02701416015625, + "learning_rate": 0.0001, + "loss": 7.8531, + "loss/crossentropy": 2.281398892402649, + "loss/hidden": 3.15625, + "loss/jsd": 0.0, + "loss/logits": 0.2439633458852768, + "step": 3256 + }, + { + "epoch": 0.203625, + "grad_norm": 2.953125, + "grad_norm_var": 0.0259918212890625, + "learning_rate": 0.0001, + "loss": 7.9964, + "loss/crossentropy": 2.1536207795143127, + "loss/hidden": 3.1875, + "loss/jsd": 0.0, + "loss/logits": 0.24516429007053375, + "step": 3258 + }, + { + "epoch": 0.20375, + "grad_norm": 3.15625, + "grad_norm_var": 0.026155598958333335, + "learning_rate": 0.0001, + "loss": 8.0773, + "loss/crossentropy": 2.070763051509857, + "loss/hidden": 3.1484375, + "loss/jsd": 0.0, + "loss/logits": 0.2518838942050934, + "step": 3260 + }, + { + "epoch": 0.203875, + "grad_norm": 3.71875, + "grad_norm_var": 0.05364176432291667, + "learning_rate": 0.0001, + "loss": 8.0644, + "loss/crossentropy": 2.411260724067688, + "loss/hidden": 3.1484375, + "loss/jsd": 0.0, + "loss/logits": 0.2600061446428299, + "step": 3262 + }, + { + "epoch": 0.204, + "grad_norm": 2.859375, + "grad_norm_var": 0.05373942057291667, + "learning_rate": 0.0001, + "loss": 8.1634, + "loss/crossentropy": 2.3903297185897827, + "loss/hidden": 3.2109375, + "loss/jsd": 0.0, + "loss/logits": 0.248751699924469, + "step": 3264 + }, + { + "epoch": 0.204125, + "grad_norm": 3.03125, + "grad_norm_var": 0.052179972330729164, + "learning_rate": 0.0001, + "loss": 8.2074, + "loss/crossentropy": 2.1826690435409546, + "loss/hidden": 3.1640625, + "loss/jsd": 0.0, + "loss/logits": 0.2685079574584961, + "step": 3266 + }, + { + "epoch": 0.20425, + "grad_norm": 3.109375, + "grad_norm_var": 0.05537007649739583, + "learning_rate": 0.0001, + "loss": 8.2012, + "loss/crossentropy": 2.207249701023102, + "loss/hidden": 3.1953125, + "loss/jsd": 0.0, + "loss/logits": 0.2473110854625702, + "step": 3268 + }, + { + "epoch": 0.204375, + "grad_norm": 3.015625, + "grad_norm_var": 0.0469146728515625, + "learning_rate": 0.0001, + "loss": 7.9611, + "loss/crossentropy": 2.4302347898483276, + "loss/hidden": 3.140625, + "loss/jsd": 0.0, + "loss/logits": 0.24606862664222717, + "step": 3270 + }, + { + "epoch": 0.2045, + "grad_norm": 2.890625, + "grad_norm_var": 0.043309529622395836, + "learning_rate": 0.0001, + "loss": 8.1377, + "loss/crossentropy": 2.538500189781189, + "loss/hidden": 3.2109375, + "loss/jsd": 0.0, + "loss/logits": 0.27223630249500275, + "step": 3272 + }, + { + "epoch": 0.204625, + "grad_norm": 2.90625, + "grad_norm_var": 0.049559529622395834, + "learning_rate": 0.0001, + "loss": 7.8854, + "loss/crossentropy": 2.048095464706421, + "loss/hidden": 3.1484375, + "loss/jsd": 0.0, + "loss/logits": 0.24504344165325165, + "step": 3274 + }, + { + "epoch": 0.20475, + "grad_norm": 2.8125, + "grad_norm_var": 0.053076171875, + "learning_rate": 0.0001, + "loss": 7.9718, + "loss/crossentropy": 2.0703362226486206, + "loss/hidden": 3.3359375, + "loss/jsd": 0.0, + "loss/logits": 0.2712066099047661, + "step": 3276 + }, + { + "epoch": 0.204875, + "grad_norm": 3.78125, + "grad_norm_var": 0.05878804524739583, + "learning_rate": 0.0001, + "loss": 8.423, + "loss/crossentropy": 2.4606130123138428, + "loss/hidden": 3.1953125, + "loss/jsd": 0.0, + "loss/logits": 0.31335532665252686, + "step": 3278 + }, + { + "epoch": 0.205, + "grad_norm": 3.03125, + "grad_norm_var": 0.05712890625, + "learning_rate": 0.0001, + "loss": 8.1598, + "loss/crossentropy": 2.6250627040863037, + "loss/hidden": 3.1953125, + "loss/jsd": 0.0, + "loss/logits": 0.27594996988773346, + "step": 3280 + }, + { + "epoch": 0.205125, + "grad_norm": 2.859375, + "grad_norm_var": 0.056818644205729164, + "learning_rate": 0.0001, + "loss": 8.2019, + "loss/crossentropy": 2.4209065437316895, + "loss/hidden": 3.1796875, + "loss/jsd": 0.0, + "loss/logits": 0.25507232546806335, + "step": 3282 + }, + { + "epoch": 0.20525, + "grad_norm": 2.953125, + "grad_norm_var": 0.05328369140625, + "learning_rate": 0.0001, + "loss": 8.1579, + "loss/crossentropy": 2.3270163536071777, + "loss/hidden": 3.1328125, + "loss/jsd": 0.0, + "loss/logits": 0.24912425875663757, + "step": 3284 + }, + { + "epoch": 0.205375, + "grad_norm": 3.015625, + "grad_norm_var": 0.0549957275390625, + "learning_rate": 0.0001, + "loss": 7.9852, + "loss/crossentropy": 2.4678841829299927, + "loss/hidden": 3.140625, + "loss/jsd": 0.0, + "loss/logits": 0.23587578535079956, + "step": 3286 + }, + { + "epoch": 0.2055, + "grad_norm": 3.0, + "grad_norm_var": 0.051167805989583336, + "learning_rate": 0.0001, + "loss": 8.105, + "loss/crossentropy": 2.420538544654846, + "loss/hidden": 3.09375, + "loss/jsd": 0.0, + "loss/logits": 0.2402632236480713, + "step": 3288 + }, + { + "epoch": 0.205625, + "grad_norm": 2.796875, + "grad_norm_var": 0.05025634765625, + "learning_rate": 0.0001, + "loss": 8.0947, + "loss/crossentropy": 2.5115219354629517, + "loss/hidden": 3.265625, + "loss/jsd": 0.0, + "loss/logits": 0.25969623029232025, + "step": 3290 + }, + { + "epoch": 0.20575, + "grad_norm": 3.078125, + "grad_norm_var": 0.04841206868489583, + "learning_rate": 0.0001, + "loss": 8.242, + "loss/crossentropy": 2.2653130292892456, + "loss/hidden": 3.1953125, + "loss/jsd": 0.0, + "loss/logits": 0.2700078934431076, + "step": 3292 + }, + { + "epoch": 0.205875, + "grad_norm": 3.5625, + "grad_norm_var": 0.0318756103515625, + "learning_rate": 0.0001, + "loss": 8.3417, + "loss/crossentropy": 2.481539011001587, + "loss/hidden": 3.1640625, + "loss/jsd": 0.0, + "loss/logits": 0.2482079267501831, + "step": 3294 + }, + { + "epoch": 0.206, + "grad_norm": 2.875, + "grad_norm_var": 0.056550089518229166, + "learning_rate": 0.0001, + "loss": 8.1301, + "loss/crossentropy": 2.377937436103821, + "loss/hidden": 3.1328125, + "loss/jsd": 0.0, + "loss/logits": 0.252384252846241, + "step": 3296 + }, + { + "epoch": 0.206125, + "grad_norm": 2.984375, + "grad_norm_var": 0.054488118489583334, + "learning_rate": 0.0001, + "loss": 8.2856, + "loss/crossentropy": 2.3734689950942993, + "loss/hidden": 3.171875, + "loss/jsd": 0.0, + "loss/logits": 0.25039682537317276, + "step": 3298 + }, + { + "epoch": 0.20625, + "grad_norm": 2.84375, + "grad_norm_var": 0.05681966145833333, + "learning_rate": 0.0001, + "loss": 7.9778, + "loss/crossentropy": 2.237556576728821, + "loss/hidden": 3.1484375, + "loss/jsd": 0.0, + "loss/logits": 0.23857159167528152, + "step": 3300 + }, + { + "epoch": 0.206375, + "grad_norm": 3.046875, + "grad_norm_var": 0.0544342041015625, + "learning_rate": 0.0001, + "loss": 7.984, + "loss/crossentropy": 2.286033868789673, + "loss/hidden": 3.15625, + "loss/jsd": 0.0, + "loss/logits": 0.2503318265080452, + "step": 3302 + }, + { + "epoch": 0.2065, + "grad_norm": 2.96875, + "grad_norm_var": 0.06236063639322917, + "learning_rate": 0.0001, + "loss": 8.1663, + "loss/crossentropy": 2.111438810825348, + "loss/hidden": 3.3203125, + "loss/jsd": 0.0, + "loss/logits": 0.26578497141599655, + "step": 3304 + }, + { + "epoch": 0.206625, + "grad_norm": 2.78125, + "grad_norm_var": 0.06298421223958334, + "learning_rate": 0.0001, + "loss": 8.047, + "loss/crossentropy": 2.0786361694335938, + "loss/hidden": 3.1171875, + "loss/jsd": 0.0, + "loss/logits": 0.2384149432182312, + "step": 3306 + }, + { + "epoch": 0.20675, + "grad_norm": 2.9375, + "grad_norm_var": 0.0692047119140625, + "learning_rate": 0.0001, + "loss": 7.8227, + "loss/crossentropy": 2.0808927416801453, + "loss/hidden": 3.125, + "loss/jsd": 0.0, + "loss/logits": 0.2648046165704727, + "step": 3308 + }, + { + "epoch": 0.206875, + "grad_norm": 2.9375, + "grad_norm_var": 0.05110575358072917, + "learning_rate": 0.0001, + "loss": 8.0012, + "loss/crossentropy": 2.1514230966567993, + "loss/hidden": 3.078125, + "loss/jsd": 0.0, + "loss/logits": 0.24816033244132996, + "step": 3310 + }, + { + "epoch": 0.207, + "grad_norm": 2.921875, + "grad_norm_var": 0.023502604166666666, + "learning_rate": 0.0001, + "loss": 7.9675, + "loss/crossentropy": 2.114220142364502, + "loss/hidden": 3.2421875, + "loss/jsd": 0.0, + "loss/logits": 0.22722011804580688, + "step": 3312 + }, + { + "epoch": 0.207125, + "grad_norm": 3.015625, + "grad_norm_var": 0.026220703125, + "learning_rate": 0.0001, + "loss": 7.8712, + "loss/crossentropy": 2.1372103095054626, + "loss/hidden": 3.25, + "loss/jsd": 0.0, + "loss/logits": 0.2645547240972519, + "step": 3314 + }, + { + "epoch": 0.20725, + "grad_norm": 2.84375, + "grad_norm_var": 0.0248931884765625, + "learning_rate": 0.0001, + "loss": 8.0588, + "loss/crossentropy": 2.372753381729126, + "loss/hidden": 3.1640625, + "loss/jsd": 0.0, + "loss/logits": 0.25332552194595337, + "step": 3316 + }, + { + "epoch": 0.207375, + "grad_norm": 3.0625, + "grad_norm_var": 0.027049763997395834, + "learning_rate": 0.0001, + "loss": 8.1708, + "loss/crossentropy": 2.311069369316101, + "loss/hidden": 3.171875, + "loss/jsd": 0.0, + "loss/logits": 0.2605705112218857, + "step": 3318 + }, + { + "epoch": 0.2075, + "grad_norm": 3.140625, + "grad_norm_var": 0.017943318684895834, + "learning_rate": 0.0001, + "loss": 8.1425, + "loss/crossentropy": 2.4386746883392334, + "loss/hidden": 3.171875, + "loss/jsd": 0.0, + "loss/logits": 0.2544742077589035, + "step": 3320 + }, + { + "epoch": 0.207625, + "grad_norm": 3.25, + "grad_norm_var": 0.020750935872395834, + "learning_rate": 0.0001, + "loss": 7.968, + "loss/crossentropy": 2.1591333150863647, + "loss/hidden": 3.1484375, + "loss/jsd": 0.0, + "loss/logits": 0.24341313540935516, + "step": 3322 + }, + { + "epoch": 0.20775, + "grad_norm": 2.90625, + "grad_norm_var": 0.018583170572916665, + "learning_rate": 0.0001, + "loss": 8.2134, + "loss/crossentropy": 2.328689455986023, + "loss/hidden": 3.1640625, + "loss/jsd": 0.0, + "loss/logits": 0.29594168066978455, + "step": 3324 + }, + { + "epoch": 0.207875, + "grad_norm": 3.140625, + "grad_norm_var": 0.023661295572916668, + "learning_rate": 0.0001, + "loss": 7.9217, + "loss/crossentropy": 2.1437469720840454, + "loss/hidden": 3.1484375, + "loss/jsd": 0.0, + "loss/logits": 0.24640139937400818, + "step": 3326 + }, + { + "epoch": 0.208, + "grad_norm": 3.125, + "grad_norm_var": 0.0243560791015625, + "learning_rate": 0.0001, + "loss": 8.0628, + "loss/crossentropy": 2.4024256467819214, + "loss/hidden": 3.203125, + "loss/jsd": 0.0, + "loss/logits": 0.2733878195285797, + "step": 3328 + }, + { + "epoch": 0.208125, + "grad_norm": 2.921875, + "grad_norm_var": 0.025386555989583334, + "learning_rate": 0.0001, + "loss": 7.8516, + "loss/crossentropy": 2.1901514530181885, + "loss/hidden": 3.1328125, + "loss/jsd": 0.0, + "loss/logits": 0.23804646730422974, + "step": 3330 + }, + { + "epoch": 0.20825, + "grad_norm": 2.90625, + "grad_norm_var": 0.024828084309895835, + "learning_rate": 0.0001, + "loss": 8.109, + "loss/crossentropy": 2.291813850402832, + "loss/hidden": 3.15625, + "loss/jsd": 0.0, + "loss/logits": 0.2448124885559082, + "step": 3332 + }, + { + "epoch": 0.208375, + "grad_norm": 2.921875, + "grad_norm_var": 0.025406901041666666, + "learning_rate": 0.0001, + "loss": 7.9948, + "loss/crossentropy": 2.341397523880005, + "loss/hidden": 3.15625, + "loss/jsd": 0.0, + "loss/logits": 0.27015161514282227, + "step": 3334 + }, + { + "epoch": 0.2085, + "grad_norm": 3.140625, + "grad_norm_var": 0.022761027018229168, + "learning_rate": 0.0001, + "loss": 8.1965, + "loss/crossentropy": 2.3533178567886353, + "loss/hidden": 3.1171875, + "loss/jsd": 0.0, + "loss/logits": 0.25063496828079224, + "step": 3336 + }, + { + "epoch": 0.208625, + "grad_norm": 2.65625, + "grad_norm_var": 0.03072509765625, + "learning_rate": 0.0001, + "loss": 8.1318, + "loss/crossentropy": 2.0992120504379272, + "loss/hidden": 3.1484375, + "loss/jsd": 0.0, + "loss/logits": 0.2553541362285614, + "step": 3338 + }, + { + "epoch": 0.20875, + "grad_norm": 3.09375, + "grad_norm_var": 0.033665974934895836, + "learning_rate": 0.0001, + "loss": 8.0182, + "loss/crossentropy": 2.382994294166565, + "loss/hidden": 3.171875, + "loss/jsd": 0.0, + "loss/logits": 0.28810542821884155, + "step": 3340 + }, + { + "epoch": 0.208875, + "grad_norm": 2.96875, + "grad_norm_var": 0.027562459309895832, + "learning_rate": 0.0001, + "loss": 7.9908, + "loss/crossentropy": 2.1971789598464966, + "loss/hidden": 3.1640625, + "loss/jsd": 0.0, + "loss/logits": 0.2596438080072403, + "step": 3342 + }, + { + "epoch": 0.209, + "grad_norm": 3.234375, + "grad_norm_var": 0.030029296875, + "learning_rate": 0.0001, + "loss": 7.7718, + "loss/crossentropy": 2.0156781673431396, + "loss/hidden": 3.21875, + "loss/jsd": 0.0, + "loss/logits": 0.23752722889184952, + "step": 3344 + }, + { + "epoch": 0.209125, + "grad_norm": 2.984375, + "grad_norm_var": 0.026123046875, + "learning_rate": 0.0001, + "loss": 7.9132, + "loss/crossentropy": 2.2873259782791138, + "loss/hidden": 3.1953125, + "loss/jsd": 0.0, + "loss/logits": 0.26303017139434814, + "step": 3346 + }, + { + "epoch": 0.20925, + "grad_norm": 2.84375, + "grad_norm_var": 0.026854451497395834, + "learning_rate": 0.0001, + "loss": 8.1614, + "loss/crossentropy": 2.208059072494507, + "loss/hidden": 3.203125, + "loss/jsd": 0.0, + "loss/logits": 0.25297391414642334, + "step": 3348 + }, + { + "epoch": 0.209375, + "grad_norm": 3.046875, + "grad_norm_var": 0.026432291666666666, + "learning_rate": 0.0001, + "loss": 8.2134, + "loss/crossentropy": 2.664340019226074, + "loss/hidden": 3.2265625, + "loss/jsd": 0.0, + "loss/logits": 0.2778072953224182, + "step": 3350 + }, + { + "epoch": 0.2095, + "grad_norm": 3.125, + "grad_norm_var": 0.025951131184895834, + "learning_rate": 0.0001, + "loss": 7.9863, + "loss/crossentropy": 2.352488398551941, + "loss/hidden": 3.2109375, + "loss/jsd": 0.0, + "loss/logits": 0.280301034450531, + "step": 3352 + }, + { + "epoch": 0.209625, + "grad_norm": 2.9375, + "grad_norm_var": 0.013109334309895833, + "learning_rate": 0.0001, + "loss": 7.9529, + "loss/crossentropy": 2.3037261962890625, + "loss/hidden": 3.171875, + "loss/jsd": 0.0, + "loss/logits": 0.2568385750055313, + "step": 3354 + }, + { + "epoch": 0.20975, + "grad_norm": 2.953125, + "grad_norm_var": 0.009175618489583334, + "learning_rate": 0.0001, + "loss": 8.0492, + "loss/crossentropy": 2.264007806777954, + "loss/hidden": 3.109375, + "loss/jsd": 0.0, + "loss/logits": 0.2372257262468338, + "step": 3356 + }, + { + "epoch": 0.209875, + "grad_norm": 2.875, + "grad_norm_var": 0.043187459309895836, + "learning_rate": 0.0001, + "loss": 8.1985, + "loss/crossentropy": 2.265676975250244, + "loss/hidden": 3.2265625, + "loss/jsd": 0.0, + "loss/logits": 0.30991341173648834, + "step": 3358 + }, + { + "epoch": 0.21, + "grad_norm": 3.203125, + "grad_norm_var": 0.04296875, + "learning_rate": 0.0001, + "loss": 8.0834, + "loss/crossentropy": 2.170192003250122, + "loss/hidden": 3.15625, + "loss/jsd": 0.0, + "loss/logits": 0.2553609609603882, + "step": 3360 + }, + { + "epoch": 0.210125, + "grad_norm": 2.96875, + "grad_norm_var": 0.04361572265625, + "learning_rate": 0.0001, + "loss": 8.091, + "loss/crossentropy": 2.4134016036987305, + "loss/hidden": 3.203125, + "loss/jsd": 0.0, + "loss/logits": 0.2930755317211151, + "step": 3362 + }, + { + "epoch": 0.21025, + "grad_norm": 3.796875, + "grad_norm_var": 0.07550455729166666, + "learning_rate": 0.0001, + "loss": 8.1486, + "loss/crossentropy": 2.3027660846710205, + "loss/hidden": 3.2421875, + "loss/jsd": 0.0, + "loss/logits": 0.2863481193780899, + "step": 3364 + }, + { + "epoch": 0.210375, + "grad_norm": 2.90625, + "grad_norm_var": 0.07553609212239583, + "learning_rate": 0.0001, + "loss": 7.9561, + "loss/crossentropy": 2.3249882459640503, + "loss/hidden": 3.1328125, + "loss/jsd": 0.0, + "loss/logits": 0.25326602160930634, + "step": 3366 + }, + { + "epoch": 0.2105, + "grad_norm": 2.953125, + "grad_norm_var": 0.07991129557291667, + "learning_rate": 0.0001, + "loss": 8.1748, + "loss/crossentropy": 2.429360032081604, + "loss/hidden": 3.203125, + "loss/jsd": 0.0, + "loss/logits": 0.2706481069326401, + "step": 3368 + }, + { + "epoch": 0.210625, + "grad_norm": 3.109375, + "grad_norm_var": 0.08212890625, + "learning_rate": 0.0001, + "loss": 8.0211, + "loss/crossentropy": 2.304826259613037, + "loss/hidden": 3.203125, + "loss/jsd": 0.0, + "loss/logits": 0.2733375281095505, + "step": 3370 + }, + { + "epoch": 0.21075, + "grad_norm": 3.453125, + "grad_norm_var": 0.10440165201822917, + "learning_rate": 0.0001, + "loss": 8.5313, + "loss/crossentropy": 2.6415737867355347, + "loss/hidden": 3.1875, + "loss/jsd": 0.0, + "loss/logits": 0.28427866101264954, + "step": 3372 + }, + { + "epoch": 0.210875, + "grad_norm": 3.390625, + "grad_norm_var": 0.08263346354166666, + "learning_rate": 0.0001, + "loss": 8.1368, + "loss/crossentropy": 2.3548909425735474, + "loss/hidden": 3.15625, + "loss/jsd": 0.0, + "loss/logits": 0.25943076610565186, + "step": 3374 + }, + { + "epoch": 0.211, + "grad_norm": 2.984375, + "grad_norm_var": 0.086669921875, + "learning_rate": 0.0001, + "loss": 8.057, + "loss/crossentropy": 2.2008095383644104, + "loss/hidden": 3.15625, + "loss/jsd": 0.0, + "loss/logits": 0.2705947160720825, + "step": 3376 + }, + { + "epoch": 0.211125, + "grad_norm": 3.375, + "grad_norm_var": 0.08528238932291667, + "learning_rate": 0.0001, + "loss": 8.1684, + "loss/crossentropy": 2.1185187101364136, + "loss/hidden": 3.1484375, + "loss/jsd": 0.0, + "loss/logits": 0.23934519290924072, + "step": 3378 + }, + { + "epoch": 0.21125, + "grad_norm": 2.890625, + "grad_norm_var": 0.06855061848958334, + "learning_rate": 0.0001, + "loss": 7.9323, + "loss/crossentropy": 2.183789014816284, + "loss/hidden": 3.1171875, + "loss/jsd": 0.0, + "loss/logits": 0.23662084341049194, + "step": 3380 + }, + { + "epoch": 0.211375, + "grad_norm": 3.515625, + "grad_norm_var": 0.07534077962239584, + "learning_rate": 0.0001, + "loss": 8.3151, + "loss/crossentropy": 2.4148218631744385, + "loss/hidden": 3.234375, + "loss/jsd": 0.0, + "loss/logits": 0.2555703818798065, + "step": 3382 + }, + { + "epoch": 0.2115, + "grad_norm": 3.078125, + "grad_norm_var": 0.06721089680989584, + "learning_rate": 0.0001, + "loss": 8.1862, + "loss/crossentropy": 2.5065032243728638, + "loss/hidden": 3.296875, + "loss/jsd": 0.0, + "loss/logits": 0.28839460015296936, + "step": 3384 + }, + { + "epoch": 0.211625, + "grad_norm": 3.125, + "grad_norm_var": 0.06750895182291666, + "learning_rate": 0.0001, + "loss": 8.0797, + "loss/crossentropy": 2.2872482538223267, + "loss/hidden": 3.171875, + "loss/jsd": 0.0, + "loss/logits": 0.2494898959994316, + "step": 3386 + }, + { + "epoch": 0.21175, + "grad_norm": 2.734375, + "grad_norm_var": 0.04421284993489583, + "learning_rate": 0.0001, + "loss": 8.0956, + "loss/crossentropy": 2.249394178390503, + "loss/hidden": 3.09375, + "loss/jsd": 0.0, + "loss/logits": 0.23356334120035172, + "step": 3388 + }, + { + "epoch": 0.211875, + "grad_norm": 2.953125, + "grad_norm_var": 0.04309488932291667, + "learning_rate": 0.0001, + "loss": 8.1253, + "loss/crossentropy": 2.4072612524032593, + "loss/hidden": 3.15625, + "loss/jsd": 0.0, + "loss/logits": 0.2705316096544266, + "step": 3390 + }, + { + "epoch": 0.212, + "grad_norm": 3.015625, + "grad_norm_var": 0.04153645833333333, + "learning_rate": 0.0001, + "loss": 8.1955, + "loss/crossentropy": 2.291887044906616, + "loss/hidden": 3.1796875, + "loss/jsd": 0.0, + "loss/logits": 0.2566347047686577, + "step": 3392 + }, + { + "epoch": 0.212125, + "grad_norm": 3.015625, + "grad_norm_var": 0.03092041015625, + "learning_rate": 0.0001, + "loss": 8.1182, + "loss/crossentropy": 2.4818975925445557, + "loss/hidden": 3.15625, + "loss/jsd": 0.0, + "loss/logits": 0.27201250195503235, + "step": 3394 + }, + { + "epoch": 0.21225, + "grad_norm": 2.9375, + "grad_norm_var": 0.029426066080729167, + "learning_rate": 0.0001, + "loss": 8.1119, + "loss/crossentropy": 2.3387409448623657, + "loss/hidden": 3.2109375, + "loss/jsd": 0.0, + "loss/logits": 0.26730673760175705, + "step": 3396 + }, + { + "epoch": 0.212375, + "grad_norm": 3.125, + "grad_norm_var": 0.014013671875, + "learning_rate": 0.0001, + "loss": 8.0371, + "loss/crossentropy": 2.4898757934570312, + "loss/hidden": 3.25, + "loss/jsd": 0.0, + "loss/logits": 0.2796122133731842, + "step": 3398 + }, + { + "epoch": 0.2125, + "grad_norm": 2.96875, + "grad_norm_var": 0.0157623291015625, + "learning_rate": 0.0001, + "loss": 7.9214, + "loss/crossentropy": 2.3467652797698975, + "loss/hidden": 3.1328125, + "loss/jsd": 0.0, + "loss/logits": 0.25767359137535095, + "step": 3400 + }, + { + "epoch": 0.212625, + "grad_norm": 2.859375, + "grad_norm_var": 0.014891560872395833, + "learning_rate": 0.0001, + "loss": 8.2623, + "loss/crossentropy": 2.23625385761261, + "loss/hidden": 3.1328125, + "loss/jsd": 0.0, + "loss/logits": 0.2665071487426758, + "step": 3402 + }, + { + "epoch": 0.21275, + "grad_norm": 2.90625, + "grad_norm_var": 0.010221354166666667, + "learning_rate": 0.0001, + "loss": 8.0001, + "loss/crossentropy": 2.247014105319977, + "loss/hidden": 3.1328125, + "loss/jsd": 0.0, + "loss/logits": 0.24841443449258804, + "step": 3404 + }, + { + "epoch": 0.212875, + "grad_norm": 2.875, + "grad_norm_var": 0.012886555989583333, + "learning_rate": 0.0001, + "loss": 8.0313, + "loss/crossentropy": 2.3653587102890015, + "loss/hidden": 3.125, + "loss/jsd": 0.0, + "loss/logits": 0.27540309727191925, + "step": 3406 + }, + { + "epoch": 0.213, + "grad_norm": 3.171875, + "grad_norm_var": 0.0152740478515625, + "learning_rate": 0.0001, + "loss": 8.2313, + "loss/crossentropy": 2.2608449459075928, + "loss/hidden": 3.15625, + "loss/jsd": 0.0, + "loss/logits": 0.2475418895483017, + "step": 3408 + }, + { + "epoch": 0.213125, + "grad_norm": 2.875, + "grad_norm_var": 0.0187896728515625, + "learning_rate": 0.0001, + "loss": 8.2868, + "loss/crossentropy": 1.9663435816764832, + "loss/hidden": 3.171875, + "loss/jsd": 0.0, + "loss/logits": 0.27278994023799896, + "step": 3410 + }, + { + "epoch": 0.21325, + "grad_norm": 3.0, + "grad_norm_var": 0.018550618489583334, + "learning_rate": 0.0001, + "loss": 8.1122, + "loss/crossentropy": 2.391019344329834, + "loss/hidden": 3.1484375, + "loss/jsd": 0.0, + "loss/logits": 0.25076402723789215, + "step": 3412 + }, + { + "epoch": 0.213375, + "grad_norm": 2.84375, + "grad_norm_var": 0.017210896809895834, + "learning_rate": 0.0001, + "loss": 7.9714, + "loss/crossentropy": 2.4968901872634888, + "loss/hidden": 3.25, + "loss/jsd": 0.0, + "loss/logits": 0.28580130636692047, + "step": 3414 + }, + { + "epoch": 0.2135, + "grad_norm": 3.109375, + "grad_norm_var": 0.019234212239583333, + "learning_rate": 0.0001, + "loss": 7.8824, + "loss/crossentropy": 2.0557892322540283, + "loss/hidden": 3.125, + "loss/jsd": 0.0, + "loss/logits": 0.24151277542114258, + "step": 3416 + }, + { + "epoch": 0.213625, + "grad_norm": 2.890625, + "grad_norm_var": 0.019627888997395832, + "learning_rate": 0.0001, + "loss": 8.0965, + "loss/crossentropy": 2.443650245666504, + "loss/hidden": 3.125, + "loss/jsd": 0.0, + "loss/logits": 0.25222641229629517, + "step": 3418 + }, + { + "epoch": 0.21375, + "grad_norm": 2.90625, + "grad_norm_var": 0.022419230143229166, + "learning_rate": 0.0001, + "loss": 8.2063, + "loss/crossentropy": 2.750308632850647, + "loss/hidden": 3.2265625, + "loss/jsd": 0.0, + "loss/logits": 0.2700583189725876, + "step": 3420 + }, + { + "epoch": 0.213875, + "grad_norm": 3.03125, + "grad_norm_var": 0.01842041015625, + "learning_rate": 0.0001, + "loss": 8.051, + "loss/crossentropy": 2.1019209027290344, + "loss/hidden": 3.1171875, + "loss/jsd": 0.0, + "loss/logits": 0.24724262952804565, + "step": 3422 + }, + { + "epoch": 0.214, + "grad_norm": 2.875, + "grad_norm_var": 0.015360514322916666, + "learning_rate": 0.0001, + "loss": 7.9942, + "loss/crossentropy": 2.328813672065735, + "loss/hidden": 3.1171875, + "loss/jsd": 0.0, + "loss/logits": 0.2349339872598648, + "step": 3424 + }, + { + "epoch": 0.214125, + "grad_norm": 2.671875, + "grad_norm_var": 0.013932291666666667, + "learning_rate": 0.0001, + "loss": 7.8529, + "loss/crossentropy": 2.2752292156219482, + "loss/hidden": 3.1796875, + "loss/jsd": 0.0, + "loss/logits": 0.25418810546398163, + "step": 3426 + }, + { + "epoch": 0.21425, + "grad_norm": 2.96875, + "grad_norm_var": 0.0168365478515625, + "learning_rate": 0.0001, + "loss": 7.9415, + "loss/crossentropy": 2.260936737060547, + "loss/hidden": 3.2109375, + "loss/jsd": 0.0, + "loss/logits": 0.25729209184646606, + "step": 3428 + }, + { + "epoch": 0.214375, + "grad_norm": 2.890625, + "grad_norm_var": 0.0165191650390625, + "learning_rate": 0.0001, + "loss": 8.1508, + "loss/crossentropy": 2.213426113128662, + "loss/hidden": 3.09375, + "loss/jsd": 0.0, + "loss/logits": 0.2391301691532135, + "step": 3430 + }, + { + "epoch": 0.2145, + "grad_norm": 2.96875, + "grad_norm_var": 0.013020833333333334, + "learning_rate": 0.0001, + "loss": 8.0022, + "loss/crossentropy": 2.320794105529785, + "loss/hidden": 3.1484375, + "loss/jsd": 0.0, + "loss/logits": 0.24946201592683792, + "step": 3432 + }, + { + "epoch": 0.214625, + "grad_norm": 2.8125, + "grad_norm_var": 0.0132476806640625, + "learning_rate": 0.0001, + "loss": 8.0841, + "loss/crossentropy": 2.2941734790802, + "loss/hidden": 3.15625, + "loss/jsd": 0.0, + "loss/logits": 0.24938707053661346, + "step": 3434 + }, + { + "epoch": 0.21475, + "grad_norm": 3.03125, + "grad_norm_var": 0.011551920572916667, + "learning_rate": 0.0001, + "loss": 8.1231, + "loss/crossentropy": 2.1613428592681885, + "loss/hidden": 3.1015625, + "loss/jsd": 0.0, + "loss/logits": 0.231092631816864, + "step": 3436 + }, + { + "epoch": 0.214875, + "grad_norm": 2.765625, + "grad_norm_var": 0.015067545572916667, + "learning_rate": 0.0001, + "loss": 7.8562, + "loss/crossentropy": 2.3163094520568848, + "loss/hidden": 3.15625, + "loss/jsd": 0.0, + "loss/logits": 0.24684642255306244, + "step": 3438 + }, + { + "epoch": 0.215, + "grad_norm": 2.921875, + "grad_norm_var": 0.014615885416666667, + "learning_rate": 0.0001, + "loss": 8.1419, + "loss/crossentropy": 2.346468925476074, + "loss/hidden": 3.140625, + "loss/jsd": 0.0, + "loss/logits": 0.2459520548582077, + "step": 3440 + }, + { + "epoch": 0.215125, + "grad_norm": 3.078125, + "grad_norm_var": 0.031473795572916664, + "learning_rate": 0.0001, + "loss": 7.9344, + "loss/crossentropy": 2.299746036529541, + "loss/hidden": 3.125, + "loss/jsd": 0.0, + "loss/logits": 0.2519551143050194, + "step": 3442 + }, + { + "epoch": 0.21525, + "grad_norm": 2.671875, + "grad_norm_var": 0.03622639973958333, + "learning_rate": 0.0001, + "loss": 8.0242, + "loss/crossentropy": 2.2825082540512085, + "loss/hidden": 3.15625, + "loss/jsd": 0.0, + "loss/logits": 0.24382151663303375, + "step": 3444 + }, + { + "epoch": 0.215375, + "grad_norm": 2.921875, + "grad_norm_var": 0.036799112955729164, + "learning_rate": 0.0001, + "loss": 8.1511, + "loss/crossentropy": 2.724141240119934, + "loss/hidden": 3.1875, + "loss/jsd": 0.0, + "loss/logits": 0.26099613308906555, + "step": 3446 + }, + { + "epoch": 0.2155, + "grad_norm": 2.75, + "grad_norm_var": 0.041356404622395836, + "learning_rate": 0.0001, + "loss": 8.01, + "loss/crossentropy": 2.4468116760253906, + "loss/hidden": 3.1328125, + "loss/jsd": 0.0, + "loss/logits": 0.2531883865594864, + "step": 3448 + }, + { + "epoch": 0.215625, + "grad_norm": 3.109375, + "grad_norm_var": 0.04243062337239583, + "learning_rate": 0.0001, + "loss": 8.3209, + "loss/crossentropy": 2.35987651348114, + "loss/hidden": 3.2734375, + "loss/jsd": 0.0, + "loss/logits": 0.2925818860530853, + "step": 3450 + }, + { + "epoch": 0.21575, + "grad_norm": 2.96875, + "grad_norm_var": 0.04248758951822917, + "learning_rate": 0.0001, + "loss": 8.2423, + "loss/crossentropy": 2.2506964802742004, + "loss/hidden": 3.171875, + "loss/jsd": 0.0, + "loss/logits": 0.2657378166913986, + "step": 3452 + }, + { + "epoch": 0.215875, + "grad_norm": 3.0, + "grad_norm_var": 0.0366363525390625, + "learning_rate": 0.0001, + "loss": 7.8316, + "loss/crossentropy": 2.4656643867492676, + "loss/hidden": 3.140625, + "loss/jsd": 0.0, + "loss/logits": 0.2636348456144333, + "step": 3454 + }, + { + "epoch": 0.216, + "grad_norm": 3.046875, + "grad_norm_var": 0.03827718098958333, + "learning_rate": 0.0001, + "loss": 7.9689, + "loss/crossentropy": 2.1773873567581177, + "loss/hidden": 3.125, + "loss/jsd": 0.0, + "loss/logits": 0.2358511984348297, + "step": 3456 + }, + { + "epoch": 0.216125, + "grad_norm": 2.984375, + "grad_norm_var": 0.019514973958333334, + "learning_rate": 0.0001, + "loss": 8.2903, + "loss/crossentropy": 2.4382131099700928, + "loss/hidden": 3.1484375, + "loss/jsd": 0.0, + "loss/logits": 0.26277345418930054, + "step": 3458 + }, + { + "epoch": 0.21625, + "grad_norm": 2.84375, + "grad_norm_var": 0.028514607747395834, + "learning_rate": 0.0001, + "loss": 7.986, + "loss/crossentropy": 2.137218475341797, + "loss/hidden": 3.1796875, + "loss/jsd": 0.0, + "loss/logits": 0.2463318258523941, + "step": 3460 + }, + { + "epoch": 0.216375, + "grad_norm": 2.78125, + "grad_norm_var": 0.03186442057291667, + "learning_rate": 0.0001, + "loss": 8.0746, + "loss/crossentropy": 2.5652899742126465, + "loss/hidden": 3.140625, + "loss/jsd": 0.0, + "loss/logits": 0.24091246724128723, + "step": 3462 + }, + { + "epoch": 0.2165, + "grad_norm": 2.953125, + "grad_norm_var": 0.0273101806640625, + "learning_rate": 0.0001, + "loss": 8.2663, + "loss/crossentropy": 2.269050359725952, + "loss/hidden": 3.1640625, + "loss/jsd": 0.0, + "loss/logits": 0.24550612270832062, + "step": 3464 + }, + { + "epoch": 0.216625, + "grad_norm": 3.109375, + "grad_norm_var": 0.0274078369140625, + "learning_rate": 0.0001, + "loss": 8.1145, + "loss/crossentropy": 2.1748660802841187, + "loss/hidden": 3.1875, + "loss/jsd": 0.0, + "loss/logits": 0.2580345571041107, + "step": 3466 + }, + { + "epoch": 0.21675, + "grad_norm": 2.75, + "grad_norm_var": 0.03177083333333333, + "learning_rate": 0.0001, + "loss": 7.7683, + "loss/crossentropy": 2.464895486831665, + "loss/hidden": 3.1640625, + "loss/jsd": 0.0, + "loss/logits": 0.26822274923324585, + "step": 3468 + }, + { + "epoch": 0.216875, + "grad_norm": 3.015625, + "grad_norm_var": 0.03211161295572917, + "learning_rate": 0.0001, + "loss": 8.0132, + "loss/crossentropy": 2.516330361366272, + "loss/hidden": 3.171875, + "loss/jsd": 0.0, + "loss/logits": 0.27802979946136475, + "step": 3470 + }, + { + "epoch": 0.217, + "grad_norm": 2.953125, + "grad_norm_var": 0.0298980712890625, + "learning_rate": 0.0001, + "loss": 8.0509, + "loss/crossentropy": 2.593145251274109, + "loss/hidden": 3.1875, + "loss/jsd": 0.0, + "loss/logits": 0.26215869188308716, + "step": 3472 + }, + { + "epoch": 0.217125, + "grad_norm": 2.8125, + "grad_norm_var": 0.029781087239583334, + "learning_rate": 0.0001, + "loss": 7.9716, + "loss/crossentropy": 2.226263165473938, + "loss/hidden": 3.1796875, + "loss/jsd": 0.0, + "loss/logits": 0.26481927931308746, + "step": 3474 + }, + { + "epoch": 0.21725, + "grad_norm": 2.953125, + "grad_norm_var": 0.013939412434895833, + "learning_rate": 0.0001, + "loss": 7.9203, + "loss/crossentropy": 2.1908382177352905, + "loss/hidden": 3.1875, + "loss/jsd": 0.0, + "loss/logits": 0.24750825762748718, + "step": 3476 + }, + { + "epoch": 0.217375, + "grad_norm": 2.75, + "grad_norm_var": 0.018876139322916666, + "learning_rate": 0.0001, + "loss": 8.1505, + "loss/crossentropy": 2.1081652641296387, + "loss/hidden": 3.1171875, + "loss/jsd": 0.0, + "loss/logits": 0.24459867179393768, + "step": 3478 + }, + { + "epoch": 0.2175, + "grad_norm": 2.9375, + "grad_norm_var": 0.019136555989583335, + "learning_rate": 0.0001, + "loss": 7.9132, + "loss/crossentropy": 2.07977694272995, + "loss/hidden": 3.109375, + "loss/jsd": 0.0, + "loss/logits": 0.2468918189406395, + "step": 3480 + }, + { + "epoch": 0.217625, + "grad_norm": 2.765625, + "grad_norm_var": 0.018778483072916668, + "learning_rate": 0.0001, + "loss": 8.036, + "loss/crossentropy": 2.2569565773010254, + "loss/hidden": 3.171875, + "loss/jsd": 0.0, + "loss/logits": 0.2362232357263565, + "step": 3482 + }, + { + "epoch": 0.21775, + "grad_norm": 2.859375, + "grad_norm_var": 0.016999308268229166, + "learning_rate": 0.0001, + "loss": 8.0578, + "loss/crossentropy": 2.3093096017837524, + "loss/hidden": 3.1328125, + "loss/jsd": 0.0, + "loss/logits": 0.2618033364415169, + "step": 3484 + }, + { + "epoch": 0.217875, + "grad_norm": 2.765625, + "grad_norm_var": 0.018245442708333334, + "learning_rate": 0.0001, + "loss": 7.9073, + "loss/crossentropy": 2.1073737144470215, + "loss/hidden": 3.109375, + "loss/jsd": 0.0, + "loss/logits": 0.23664266616106033, + "step": 3486 + }, + { + "epoch": 0.218, + "grad_norm": 11.875, + "grad_norm_var": 7.317122395833334, + "learning_rate": 0.0001, + "loss": 8.6386, + "loss/crossentropy": 2.440091371536255, + "loss/hidden": 3.2109375, + "loss/jsd": 0.0, + "loss/logits": 0.2737603783607483, + "step": 3488 + }, + { + "epoch": 0.218125, + "grad_norm": 3.125, + "grad_norm_var": 7.408426920572917, + "learning_rate": 0.0001, + "loss": 8.3179, + "loss/crossentropy": 2.4426426887512207, + "loss/hidden": 3.1953125, + "loss/jsd": 0.0, + "loss/logits": 0.2780514657497406, + "step": 3490 + }, + { + "epoch": 0.21825, + "grad_norm": 3.046875, + "grad_norm_var": 7.375365193684896, + "learning_rate": 0.0001, + "loss": 8.0277, + "loss/crossentropy": 2.196107029914856, + "loss/hidden": 3.1484375, + "loss/jsd": 0.0, + "loss/logits": 0.25197281688451767, + "step": 3492 + }, + { + "epoch": 0.218375, + "grad_norm": 3.140625, + "grad_norm_var": 7.33541259765625, + "learning_rate": 0.0001, + "loss": 8.0933, + "loss/crossentropy": 2.404140591621399, + "loss/hidden": 3.171875, + "loss/jsd": 0.0, + "loss/logits": 0.24803149700164795, + "step": 3494 + }, + { + "epoch": 0.2185, + "grad_norm": 2.890625, + "grad_norm_var": 7.330557250976563, + "learning_rate": 0.0001, + "loss": 7.9468, + "loss/crossentropy": 2.261552095413208, + "loss/hidden": 3.203125, + "loss/jsd": 0.0, + "loss/logits": 0.2754479944705963, + "step": 3496 + }, + { + "epoch": 0.218625, + "grad_norm": 3.21875, + "grad_norm_var": 7.257470703125, + "learning_rate": 0.0001, + "loss": 8.0821, + "loss/crossentropy": 2.354046583175659, + "loss/hidden": 3.203125, + "loss/jsd": 0.0, + "loss/logits": 0.2597518563270569, + "step": 3498 + }, + { + "epoch": 0.21875, + "grad_norm": 6.40625, + "grad_norm_var": 7.410123697916666, + "learning_rate": 0.0001, + "loss": 8.6704, + "loss/crossentropy": 2.2524830102920532, + "loss/hidden": 3.109375, + "loss/jsd": 0.0, + "loss/logits": 0.26117394119501114, + "step": 3500 + }, + { + "epoch": 0.218875, + "grad_norm": 3.203125, + "grad_norm_var": 7.230793253580729, + "learning_rate": 0.0001, + "loss": 8.2819, + "loss/crossentropy": 2.433029532432556, + "loss/hidden": 3.203125, + "loss/jsd": 0.0, + "loss/logits": 0.26937395334243774, + "step": 3502 + }, + { + "epoch": 0.219, + "grad_norm": 3.125, + "grad_norm_var": 1.079613240559896, + "learning_rate": 0.0001, + "loss": 8.0829, + "loss/crossentropy": 2.180580735206604, + "loss/hidden": 3.109375, + "loss/jsd": 0.0, + "loss/logits": 0.24801570177078247, + "step": 3504 + }, + { + "epoch": 0.219125, + "grad_norm": 3.15625, + "grad_norm_var": 0.7117472330729167, + "learning_rate": 0.0001, + "loss": 8.0049, + "loss/crossentropy": 2.497164011001587, + "loss/hidden": 3.140625, + "loss/jsd": 0.0, + "loss/logits": 0.2396157830953598, + "step": 3506 + }, + { + "epoch": 0.21925, + "grad_norm": 3.046875, + "grad_norm_var": 0.70777587890625, + "learning_rate": 0.0001, + "loss": 8.0631, + "loss/crossentropy": 2.0866791009902954, + "loss/hidden": 3.125, + "loss/jsd": 0.0, + "loss/logits": 0.2625848799943924, + "step": 3508 + }, + { + "epoch": 0.219375, + "grad_norm": 3.234375, + "grad_norm_var": 0.70172119140625, + "learning_rate": 0.0001, + "loss": 8.0452, + "loss/crossentropy": 2.189347505569458, + "loss/hidden": 3.1875, + "loss/jsd": 0.0, + "loss/logits": 0.23898432403802872, + "step": 3510 + }, + { + "epoch": 0.2195, + "grad_norm": 2.875, + "grad_norm_var": 0.7069986979166667, + "learning_rate": 0.0001, + "loss": 8.1909, + "loss/crossentropy": 2.3433092832565308, + "loss/hidden": 3.1796875, + "loss/jsd": 0.0, + "loss/logits": 0.2604905217885971, + "step": 3512 + }, + { + "epoch": 0.219625, + "grad_norm": 2.796875, + "grad_norm_var": 0.7274241129557292, + "learning_rate": 0.0001, + "loss": 7.9611, + "loss/crossentropy": 2.4440083503723145, + "loss/hidden": 3.1640625, + "loss/jsd": 0.0, + "loss/logits": 0.2595008611679077, + "step": 3514 + }, + { + "epoch": 0.21975, + "grad_norm": 3.09375, + "grad_norm_var": 0.04013264973958333, + "learning_rate": 0.0001, + "loss": 8.2323, + "loss/crossentropy": 2.4784871339797974, + "loss/hidden": 3.21875, + "loss/jsd": 0.0, + "loss/logits": 0.2541923224925995, + "step": 3516 + }, + { + "epoch": 0.219875, + "grad_norm": 3.109375, + "grad_norm_var": 0.0164947509765625, + "learning_rate": 0.0001, + "loss": 7.9576, + "loss/crossentropy": 2.083495259284973, + "loss/hidden": 3.1015625, + "loss/jsd": 0.0, + "loss/logits": 0.24065294116735458, + "step": 3518 + }, + { + "epoch": 0.22, + "grad_norm": 3.0, + "grad_norm_var": 0.015965779622395832, + "learning_rate": 0.0001, + "loss": 7.9361, + "loss/crossentropy": 2.15146005153656, + "loss/hidden": 3.0703125, + "loss/jsd": 0.0, + "loss/logits": 0.24030451476573944, + "step": 3520 + }, + { + "epoch": 0.220125, + "grad_norm": 3.03125, + "grad_norm_var": 0.014582316080729166, + "learning_rate": 0.0001, + "loss": 8.107, + "loss/crossentropy": 2.2797966599464417, + "loss/hidden": 3.0390625, + "loss/jsd": 0.0, + "loss/logits": 0.2358318492770195, + "step": 3522 + }, + { + "epoch": 0.22025, + "grad_norm": 3.09375, + "grad_norm_var": 0.019071451822916665, + "learning_rate": 0.0001, + "loss": 8.0834, + "loss/crossentropy": 2.270912528038025, + "loss/hidden": 3.1953125, + "loss/jsd": 0.0, + "loss/logits": 0.2688527777791023, + "step": 3524 + }, + { + "epoch": 0.220375, + "grad_norm": 2.765625, + "grad_norm_var": 0.021044921875, + "learning_rate": 0.0001, + "loss": 7.9002, + "loss/crossentropy": 2.1109871864318848, + "loss/hidden": 3.09375, + "loss/jsd": 0.0, + "loss/logits": 0.24296989291906357, + "step": 3526 + }, + { + "epoch": 0.2205, + "grad_norm": 3.734375, + "grad_norm_var": 0.0573883056640625, + "learning_rate": 0.0001, + "loss": 8.0665, + "loss/crossentropy": 2.3173556327819824, + "loss/hidden": 3.109375, + "loss/jsd": 0.0, + "loss/logits": 0.2529396265745163, + "step": 3528 + }, + { + "epoch": 0.220625, + "grad_norm": 3.078125, + "grad_norm_var": 0.05663655598958333, + "learning_rate": 0.0001, + "loss": 8.0593, + "loss/crossentropy": 2.168402671813965, + "loss/hidden": 3.140625, + "loss/jsd": 0.0, + "loss/logits": 0.26267802715301514, + "step": 3530 + }, + { + "epoch": 0.22075, + "grad_norm": 2.828125, + "grad_norm_var": 0.0636871337890625, + "learning_rate": 0.0001, + "loss": 8.2203, + "loss/crossentropy": 2.3692712783813477, + "loss/hidden": 3.1796875, + "loss/jsd": 0.0, + "loss/logits": 0.2798602133989334, + "step": 3532 + }, + { + "epoch": 0.220875, + "grad_norm": 2.578125, + "grad_norm_var": 0.07527567545572916, + "learning_rate": 0.0001, + "loss": 7.9934, + "loss/crossentropy": 2.441710114479065, + "loss/hidden": 3.15625, + "loss/jsd": 0.0, + "loss/logits": 0.264100581407547, + "step": 3534 + }, + { + "epoch": 0.221, + "grad_norm": 2.84375, + "grad_norm_var": 0.07697652180989584, + "learning_rate": 0.0001, + "loss": 7.914, + "loss/crossentropy": 2.2376210689544678, + "loss/hidden": 3.125, + "loss/jsd": 0.0, + "loss/logits": 0.2386428266763687, + "step": 3536 + }, + { + "epoch": 0.221125, + "grad_norm": 2.78125, + "grad_norm_var": 0.08683268229166667, + "learning_rate": 0.0001, + "loss": 7.8692, + "loss/crossentropy": 1.9457404017448425, + "loss/hidden": 3.0859375, + "loss/jsd": 0.0, + "loss/logits": 0.23729050904512405, + "step": 3538 + }, + { + "epoch": 0.22125, + "grad_norm": 2.875, + "grad_norm_var": 0.07551167805989584, + "learning_rate": 0.0001, + "loss": 7.9995, + "loss/crossentropy": 2.2962979078292847, + "loss/hidden": 3.09375, + "loss/jsd": 0.0, + "loss/logits": 0.2636758089065552, + "step": 3540 + }, + { + "epoch": 0.221375, + "grad_norm": 2.890625, + "grad_norm_var": 0.07558492024739584, + "learning_rate": 0.0001, + "loss": 7.9346, + "loss/crossentropy": 2.275332808494568, + "loss/hidden": 3.1484375, + "loss/jsd": 0.0, + "loss/logits": 0.2384483963251114, + "step": 3542 + }, + { + "epoch": 0.2215, + "grad_norm": 2.875, + "grad_norm_var": 0.027839152018229167, + "learning_rate": 0.0001, + "loss": 7.6944, + "loss/crossentropy": 2.0947870016098022, + "loss/hidden": 3.1484375, + "loss/jsd": 0.0, + "loss/logits": 0.22623080015182495, + "step": 3544 + }, + { + "epoch": 0.221625, + "grad_norm": 2.859375, + "grad_norm_var": 0.025633748372395834, + "learning_rate": 0.0001, + "loss": 7.7702, + "loss/crossentropy": 2.4045172929763794, + "loss/hidden": 3.1484375, + "loss/jsd": 0.0, + "loss/logits": 0.2514096647500992, + "step": 3546 + }, + { + "epoch": 0.22175, + "grad_norm": 2.921875, + "grad_norm_var": 0.0135894775390625, + "learning_rate": 0.0001, + "loss": 7.8777, + "loss/crossentropy": 2.2690885066986084, + "loss/hidden": 3.15625, + "loss/jsd": 0.0, + "loss/logits": 0.23157186806201935, + "step": 3548 + }, + { + "epoch": 0.221875, + "grad_norm": 2.6875, + "grad_norm_var": 0.02906494140625, + "learning_rate": 0.0001, + "loss": 8.1106, + "loss/crossentropy": 2.4599485397338867, + "loss/hidden": 3.203125, + "loss/jsd": 0.0, + "loss/logits": 0.28053322434425354, + "step": 3550 + }, + { + "epoch": 0.222, + "grad_norm": 2.8125, + "grad_norm_var": 0.028880818684895834, + "learning_rate": 0.0001, + "loss": 7.9334, + "loss/crossentropy": 2.15872859954834, + "loss/hidden": 3.171875, + "loss/jsd": 0.0, + "loss/logits": 0.24894960224628448, + "step": 3552 + }, + { + "epoch": 0.222125, + "grad_norm": 2.875, + "grad_norm_var": 0.024486287434895834, + "learning_rate": 0.0001, + "loss": 7.8626, + "loss/crossentropy": 2.3765709400177, + "loss/hidden": 3.1484375, + "loss/jsd": 0.0, + "loss/logits": 0.25589361786842346, + "step": 3554 + }, + { + "epoch": 0.22225, + "grad_norm": 2.84375, + "grad_norm_var": 0.0244293212890625, + "learning_rate": 0.0001, + "loss": 7.9958, + "loss/crossentropy": 2.0382995009422302, + "loss/hidden": 3.1015625, + "loss/jsd": 0.0, + "loss/logits": 0.23948778212070465, + "step": 3556 + }, + { + "epoch": 0.222375, + "grad_norm": 2.984375, + "grad_norm_var": 0.022981770833333335, + "learning_rate": 0.0001, + "loss": 7.9355, + "loss/crossentropy": 2.2267855405807495, + "loss/hidden": 3.109375, + "loss/jsd": 0.0, + "loss/logits": 0.23568203300237656, + "step": 3558 + }, + { + "epoch": 0.2225, + "grad_norm": 3.109375, + "grad_norm_var": 0.025846354166666665, + "learning_rate": 0.0001, + "loss": 8.0736, + "loss/crossentropy": 2.304799437522888, + "loss/hidden": 3.1171875, + "loss/jsd": 0.0, + "loss/logits": 0.23179854452610016, + "step": 3560 + }, + { + "epoch": 0.222625, + "grad_norm": 2.8125, + "grad_norm_var": 0.028206380208333333, + "learning_rate": 0.0001, + "loss": 8.1224, + "loss/crossentropy": 2.3973569869995117, + "loss/hidden": 3.1484375, + "loss/jsd": 0.0, + "loss/logits": 0.258654460310936, + "step": 3562 + }, + { + "epoch": 0.22275, + "grad_norm": 2.875, + "grad_norm_var": 0.027132161458333335, + "learning_rate": 0.0001, + "loss": 7.8143, + "loss/crossentropy": 2.097848653793335, + "loss/hidden": 3.109375, + "loss/jsd": 0.0, + "loss/logits": 0.23939958959817886, + "step": 3564 + }, + { + "epoch": 0.222875, + "grad_norm": 3.234375, + "grad_norm_var": 0.018822224934895833, + "learning_rate": 0.0001, + "loss": 7.9652, + "loss/crossentropy": 2.5855683088302612, + "loss/hidden": 3.171875, + "loss/jsd": 0.0, + "loss/logits": 0.2576940581202507, + "step": 3566 + }, + { + "epoch": 0.223, + "grad_norm": 3.0, + "grad_norm_var": 0.01822509765625, + "learning_rate": 0.0001, + "loss": 8.0738, + "loss/crossentropy": 2.328821897506714, + "loss/hidden": 3.1640625, + "loss/jsd": 0.0, + "loss/logits": 0.25657960772514343, + "step": 3568 + }, + { + "epoch": 0.223125, + "grad_norm": 2.828125, + "grad_norm_var": 0.022997029622395835, + "learning_rate": 0.0001, + "loss": 8.0868, + "loss/crossentropy": 2.537997841835022, + "loss/hidden": 3.25, + "loss/jsd": 0.0, + "loss/logits": 0.26182398200035095, + "step": 3570 + }, + { + "epoch": 0.22325, + "grad_norm": 2.671875, + "grad_norm_var": 0.027057902018229166, + "learning_rate": 0.0001, + "loss": 7.8351, + "loss/crossentropy": 2.1498693227767944, + "loss/hidden": 3.125, + "loss/jsd": 0.0, + "loss/logits": 0.25888827443122864, + "step": 3572 + }, + { + "epoch": 0.223375, + "grad_norm": 3.15625, + "grad_norm_var": 0.03264058430989583, + "learning_rate": 0.0001, + "loss": 7.9163, + "loss/crossentropy": 2.3035428524017334, + "loss/hidden": 3.140625, + "loss/jsd": 0.0, + "loss/logits": 0.2618688642978668, + "step": 3574 + }, + { + "epoch": 0.2235, + "grad_norm": 3.015625, + "grad_norm_var": 0.03328348795572917, + "learning_rate": 0.0001, + "loss": 8.1395, + "loss/crossentropy": 2.457966685295105, + "loss/hidden": 3.15625, + "loss/jsd": 0.0, + "loss/logits": 0.2607010751962662, + "step": 3576 + }, + { + "epoch": 0.223625, + "grad_norm": 3.234375, + "grad_norm_var": 0.04112040201822917, + "learning_rate": 0.0001, + "loss": 8.1893, + "loss/crossentropy": 2.220232129096985, + "loss/hidden": 3.15625, + "loss/jsd": 0.0, + "loss/logits": 0.2506686598062515, + "step": 3578 + }, + { + "epoch": 0.22375, + "grad_norm": 2.765625, + "grad_norm_var": 0.0400787353515625, + "learning_rate": 0.0001, + "loss": 8.0102, + "loss/crossentropy": 2.3712148666381836, + "loss/hidden": 3.1015625, + "loss/jsd": 0.0, + "loss/logits": 0.2428571730852127, + "step": 3580 + }, + { + "epoch": 0.223875, + "grad_norm": 2.65625, + "grad_norm_var": 0.04485270182291667, + "learning_rate": 0.0001, + "loss": 7.8827, + "loss/crossentropy": 2.2805765867233276, + "loss/hidden": 3.1328125, + "loss/jsd": 0.0, + "loss/logits": 0.23625147342681885, + "step": 3582 + }, + { + "epoch": 0.224, + "grad_norm": 2.78125, + "grad_norm_var": 0.048371378580729166, + "learning_rate": 0.0001, + "loss": 8.0278, + "loss/crossentropy": 2.3268240690231323, + "loss/hidden": 3.15625, + "loss/jsd": 0.0, + "loss/logits": 0.2626963108778, + "step": 3584 + }, + { + "epoch": 0.224125, + "grad_norm": 2.984375, + "grad_norm_var": 0.04468485514322917, + "learning_rate": 0.0001, + "loss": 7.8847, + "loss/crossentropy": 2.4000160694122314, + "loss/hidden": 3.140625, + "loss/jsd": 0.0, + "loss/logits": 0.2558213621377945, + "step": 3586 + }, + { + "epoch": 0.22425, + "grad_norm": 3.109375, + "grad_norm_var": 0.0423492431640625, + "learning_rate": 0.0001, + "loss": 7.9188, + "loss/crossentropy": 2.296359062194824, + "loss/hidden": 3.1484375, + "loss/jsd": 0.0, + "loss/logits": 0.24004730582237244, + "step": 3588 + }, + { + "epoch": 0.224375, + "grad_norm": 2.984375, + "grad_norm_var": 0.03669331868489583, + "learning_rate": 0.0001, + "loss": 7.9116, + "loss/crossentropy": 2.4102399349212646, + "loss/hidden": 3.1015625, + "loss/jsd": 0.0, + "loss/logits": 0.24954771995544434, + "step": 3590 + }, + { + "epoch": 0.2245, + "grad_norm": 2.78125, + "grad_norm_var": 0.035660807291666666, + "learning_rate": 0.0001, + "loss": 7.9391, + "loss/crossentropy": 2.33876371383667, + "loss/hidden": 3.1640625, + "loss/jsd": 0.0, + "loss/logits": 0.2509802132844925, + "step": 3592 + }, + { + "epoch": 0.224625, + "grad_norm": 2.890625, + "grad_norm_var": 0.0248046875, + "learning_rate": 0.0001, + "loss": 7.8352, + "loss/crossentropy": 2.33649480342865, + "loss/hidden": 3.0625, + "loss/jsd": 0.0, + "loss/logits": 0.2264355644583702, + "step": 3594 + }, + { + "epoch": 0.22475, + "grad_norm": 3.140625, + "grad_norm_var": 0.027718098958333333, + "learning_rate": 0.0001, + "loss": 8.0294, + "loss/crossentropy": 2.255256175994873, + "loss/hidden": 3.125, + "loss/jsd": 0.0, + "loss/logits": 0.24728095531463623, + "step": 3596 + }, + { + "epoch": 0.224875, + "grad_norm": 3.0, + "grad_norm_var": 0.03717447916666667, + "learning_rate": 0.0001, + "loss": 8.3306, + "loss/crossentropy": 2.1616681814193726, + "loss/hidden": 3.1484375, + "loss/jsd": 0.0, + "loss/logits": 0.3028740808367729, + "step": 3598 + }, + { + "epoch": 0.225, + "grad_norm": 3.203125, + "grad_norm_var": 0.033665974934895836, + "learning_rate": 0.0001, + "loss": 8.1381, + "loss/crossentropy": 2.1879382133483887, + "loss/hidden": 3.1796875, + "loss/jsd": 0.0, + "loss/logits": 0.24906174838542938, + "step": 3600 + }, + { + "epoch": 0.225125, + "grad_norm": 2.71875, + "grad_norm_var": 0.0383209228515625, + "learning_rate": 0.0001, + "loss": 8.2209, + "loss/crossentropy": 2.551230788230896, + "loss/hidden": 3.1328125, + "loss/jsd": 0.0, + "loss/logits": 0.26665643602609634, + "step": 3602 + }, + { + "epoch": 0.22525, + "grad_norm": 3.359375, + "grad_norm_var": 0.04551493326822917, + "learning_rate": 0.0001, + "loss": 8.3031, + "loss/crossentropy": 2.5071710348129272, + "loss/hidden": 3.3125, + "loss/jsd": 0.0, + "loss/logits": 0.342557817697525, + "step": 3604 + }, + { + "epoch": 0.225375, + "grad_norm": 2.90625, + "grad_norm_var": 0.047098795572916664, + "learning_rate": 0.0001, + "loss": 8.1181, + "loss/crossentropy": 2.30819833278656, + "loss/hidden": 3.171875, + "loss/jsd": 0.0, + "loss/logits": 0.26556289196014404, + "step": 3606 + }, + { + "epoch": 0.2255, + "grad_norm": 2.890625, + "grad_norm_var": 0.0525054931640625, + "learning_rate": 0.0001, + "loss": 7.7835, + "loss/crossentropy": 2.093214511871338, + "loss/hidden": 3.125, + "loss/jsd": 0.0, + "loss/logits": 0.2401185780763626, + "step": 3608 + }, + { + "epoch": 0.225625, + "grad_norm": 3.015625, + "grad_norm_var": 0.047265625, + "learning_rate": 0.0001, + "loss": 8.0272, + "loss/crossentropy": 2.3199344873428345, + "loss/hidden": 3.1875, + "loss/jsd": 0.0, + "loss/logits": 0.2687990739941597, + "step": 3610 + }, + { + "epoch": 0.22575, + "grad_norm": 3.21875, + "grad_norm_var": 0.05095113118489583, + "learning_rate": 0.0001, + "loss": 8.0956, + "loss/crossentropy": 2.178806185722351, + "loss/hidden": 3.09375, + "loss/jsd": 0.0, + "loss/logits": 0.22464393079280853, + "step": 3612 + }, + { + "epoch": 0.225875, + "grad_norm": 2.9375, + "grad_norm_var": 0.0380035400390625, + "learning_rate": 0.0001, + "loss": 8.2848, + "loss/crossentropy": 2.8881205320358276, + "loss/hidden": 3.34375, + "loss/jsd": 0.0, + "loss/logits": 0.2964523136615753, + "step": 3614 + }, + { + "epoch": 0.226, + "grad_norm": 3.5, + "grad_norm_var": 0.0517486572265625, + "learning_rate": 0.0001, + "loss": 8.1152, + "loss/crossentropy": 2.3062459230422974, + "loss/hidden": 3.140625, + "loss/jsd": 0.0, + "loss/logits": 0.2544550150632858, + "step": 3616 + }, + { + "epoch": 0.226125, + "grad_norm": 2.765625, + "grad_norm_var": 0.050812784830729166, + "learning_rate": 0.0001, + "loss": 7.9502, + "loss/crossentropy": 2.1567277312278748, + "loss/hidden": 3.140625, + "loss/jsd": 0.0, + "loss/logits": 0.25140413641929626, + "step": 3618 + }, + { + "epoch": 0.22625, + "grad_norm": 2.921875, + "grad_norm_var": 0.04439697265625, + "learning_rate": 0.0001, + "loss": 7.8556, + "loss/crossentropy": 2.313141703605652, + "loss/hidden": 3.140625, + "loss/jsd": 0.0, + "loss/logits": 0.24740488827228546, + "step": 3620 + }, + { + "epoch": 0.226375, + "grad_norm": 2.890625, + "grad_norm_var": 0.04332275390625, + "learning_rate": 0.0001, + "loss": 8.0423, + "loss/crossentropy": 2.328591823577881, + "loss/hidden": 3.1484375, + "loss/jsd": 0.0, + "loss/logits": 0.26608438789844513, + "step": 3622 + }, + { + "epoch": 0.2265, + "grad_norm": 3.125, + "grad_norm_var": 0.0403472900390625, + "learning_rate": 0.0001, + "loss": 8.0185, + "loss/crossentropy": 2.4062294960021973, + "loss/hidden": 3.171875, + "loss/jsd": 0.0, + "loss/logits": 0.25196973979473114, + "step": 3624 + }, + { + "epoch": 0.226625, + "grad_norm": 3.046875, + "grad_norm_var": 0.03852437337239583, + "learning_rate": 0.0001, + "loss": 8.2194, + "loss/crossentropy": 2.3326817750930786, + "loss/hidden": 3.140625, + "loss/jsd": 0.0, + "loss/logits": 0.25500936061143875, + "step": 3626 + }, + { + "epoch": 0.22675, + "grad_norm": 2.84375, + "grad_norm_var": 0.039876302083333336, + "learning_rate": 0.0001, + "loss": 8.0569, + "loss/crossentropy": 2.482856869697571, + "loss/hidden": 3.140625, + "loss/jsd": 0.0, + "loss/logits": 0.25555209815502167, + "step": 3628 + }, + { + "epoch": 0.226875, + "grad_norm": 3.53125, + "grad_norm_var": 0.10530598958333333, + "learning_rate": 0.0001, + "loss": 8.2217, + "loss/crossentropy": 2.392806649208069, + "loss/hidden": 3.171875, + "loss/jsd": 0.0, + "loss/logits": 0.2713817358016968, + "step": 3630 + }, + { + "epoch": 0.227, + "grad_norm": 2.96875, + "grad_norm_var": 0.0933746337890625, + "learning_rate": 0.0001, + "loss": 8.2214, + "loss/crossentropy": 2.4297484159469604, + "loss/hidden": 3.1328125, + "loss/jsd": 0.0, + "loss/logits": 0.26915179938077927, + "step": 3632 + }, + { + "epoch": 0.227125, + "grad_norm": 2.96875, + "grad_norm_var": 0.09046223958333334, + "learning_rate": 0.0001, + "loss": 7.892, + "loss/crossentropy": 2.0668236017227173, + "loss/hidden": 3.171875, + "loss/jsd": 0.0, + "loss/logits": 0.24264214932918549, + "step": 3634 + }, + { + "epoch": 0.22725, + "grad_norm": 2.8125, + "grad_norm_var": 0.0952056884765625, + "learning_rate": 0.0001, + "loss": 7.6419, + "loss/crossentropy": 2.2886130809783936, + "loss/hidden": 3.15625, + "loss/jsd": 0.0, + "loss/logits": 0.2411990761756897, + "step": 3636 + }, + { + "epoch": 0.227375, + "grad_norm": 2.765625, + "grad_norm_var": 0.1007476806640625, + "learning_rate": 0.0001, + "loss": 7.8926, + "loss/crossentropy": 2.4050437211990356, + "loss/hidden": 3.125, + "loss/jsd": 0.0, + "loss/logits": 0.248046413064003, + "step": 3638 + }, + { + "epoch": 0.2275, + "grad_norm": 3.015625, + "grad_norm_var": 0.0963775634765625, + "learning_rate": 0.0001, + "loss": 7.9465, + "loss/crossentropy": 2.237168073654175, + "loss/hidden": 3.171875, + "loss/jsd": 0.0, + "loss/logits": 0.2606538310647011, + "step": 3640 + }, + { + "epoch": 0.227625, + "grad_norm": 2.96875, + "grad_norm_var": 0.09270426432291666, + "learning_rate": 0.0001, + "loss": 8.0894, + "loss/crossentropy": 2.1759636998176575, + "loss/hidden": 3.125, + "loss/jsd": 0.0, + "loss/logits": 0.25069190561771393, + "step": 3642 + }, + { + "epoch": 0.22775, + "grad_norm": 2.75, + "grad_norm_var": 0.09306640625, + "learning_rate": 0.0001, + "loss": 7.8278, + "loss/crossentropy": 2.33109974861145, + "loss/hidden": 3.125, + "loss/jsd": 0.0, + "loss/logits": 0.24534446746110916, + "step": 3644 + }, + { + "epoch": 0.227875, + "grad_norm": 2.8125, + "grad_norm_var": 0.010319010416666666, + "learning_rate": 0.0001, + "loss": 7.9532, + "loss/crossentropy": 2.209414005279541, + "loss/hidden": 3.203125, + "loss/jsd": 0.0, + "loss/logits": 0.23883101344108582, + "step": 3646 + }, + { + "epoch": 0.228, + "grad_norm": 2.9375, + "grad_norm_var": 0.009993489583333333, + "learning_rate": 0.0001, + "loss": 8.1503, + "loss/crossentropy": 2.3681305646896362, + "loss/hidden": 3.140625, + "loss/jsd": 0.0, + "loss/logits": 0.2580679953098297, + "step": 3648 + }, + { + "epoch": 0.228125, + "grad_norm": 2.890625, + "grad_norm_var": 0.011913045247395834, + "learning_rate": 0.0001, + "loss": 7.9858, + "loss/crossentropy": 2.2001166343688965, + "loss/hidden": 3.0859375, + "loss/jsd": 0.0, + "loss/logits": 0.2430475950241089, + "step": 3650 + }, + { + "epoch": 0.22825, + "grad_norm": 3.0, + "grad_norm_var": 0.011800130208333334, + "learning_rate": 0.0001, + "loss": 7.9813, + "loss/crossentropy": 2.2274385690689087, + "loss/hidden": 3.15625, + "loss/jsd": 0.0, + "loss/logits": 0.23911988735198975, + "step": 3652 + }, + { + "epoch": 0.228375, + "grad_norm": 2.84375, + "grad_norm_var": 0.010380045572916666, + "learning_rate": 0.0001, + "loss": 8.0302, + "loss/crossentropy": 2.230368971824646, + "loss/hidden": 3.125, + "loss/jsd": 0.0, + "loss/logits": 0.24802518635988235, + "step": 3654 + }, + { + "epoch": 0.2285, + "grad_norm": 2.6875, + "grad_norm_var": 0.012105305989583334, + "learning_rate": 0.0001, + "loss": 7.7304, + "loss/crossentropy": 2.195298194885254, + "loss/hidden": 3.1484375, + "loss/jsd": 0.0, + "loss/logits": 0.23306956887245178, + "step": 3656 + }, + { + "epoch": 0.228625, + "grad_norm": 2.90625, + "grad_norm_var": 0.014061482747395833, + "learning_rate": 0.0001, + "loss": 8.0813, + "loss/crossentropy": 2.1692891120910645, + "loss/hidden": 3.171875, + "loss/jsd": 0.0, + "loss/logits": 0.24116355180740356, + "step": 3658 + }, + { + "epoch": 0.22875, + "grad_norm": 3.125, + "grad_norm_var": 0.017292277018229166, + "learning_rate": 0.0001, + "loss": 8.1278, + "loss/crossentropy": 2.3059465885162354, + "loss/hidden": 3.1328125, + "loss/jsd": 0.0, + "loss/logits": 0.24064010381698608, + "step": 3660 + }, + { + "epoch": 0.228875, + "grad_norm": 2.90625, + "grad_norm_var": 0.018192545572916666, + "learning_rate": 0.0001, + "loss": 7.9778, + "loss/crossentropy": 2.399103045463562, + "loss/hidden": 3.125, + "loss/jsd": 0.0, + "loss/logits": 0.23807506263256073, + "step": 3662 + }, + { + "epoch": 0.229, + "grad_norm": 2.828125, + "grad_norm_var": 0.01832275390625, + "learning_rate": 0.0001, + "loss": 8.1474, + "loss/crossentropy": 2.3898919820785522, + "loss/hidden": 3.15625, + "loss/jsd": 0.0, + "loss/logits": 0.25592923909425735, + "step": 3664 + }, + { + "epoch": 0.229125, + "grad_norm": 2.71875, + "grad_norm_var": 0.0172515869140625, + "learning_rate": 0.0001, + "loss": 7.8636, + "loss/crossentropy": 2.329254150390625, + "loss/hidden": 3.171875, + "loss/jsd": 0.0, + "loss/logits": 0.2536974400281906, + "step": 3666 + }, + { + "epoch": 0.22925, + "grad_norm": 2.65625, + "grad_norm_var": 0.018648274739583335, + "learning_rate": 0.0001, + "loss": 7.8664, + "loss/crossentropy": 2.1764838695526123, + "loss/hidden": 3.078125, + "loss/jsd": 0.0, + "loss/logits": 0.2520892173051834, + "step": 3668 + }, + { + "epoch": 0.229375, + "grad_norm": 2.78125, + "grad_norm_var": 0.027025349934895835, + "learning_rate": 0.0001, + "loss": 7.9033, + "loss/crossentropy": 2.326913833618164, + "loss/hidden": 3.15625, + "loss/jsd": 0.0, + "loss/logits": 0.22852397710084915, + "step": 3670 + }, + { + "epoch": 0.2295, + "grad_norm": 2.8125, + "grad_norm_var": 0.025251261393229165, + "learning_rate": 0.0001, + "loss": 8.1325, + "loss/crossentropy": 2.5875282287597656, + "loss/hidden": 3.1953125, + "loss/jsd": 0.0, + "loss/logits": 0.25728514790534973, + "step": 3672 + }, + { + "epoch": 0.229625, + "grad_norm": 3.125, + "grad_norm_var": 0.025536092122395833, + "learning_rate": 0.0001, + "loss": 8.1942, + "loss/crossentropy": 2.4060288667678833, + "loss/hidden": 3.1328125, + "loss/jsd": 0.0, + "loss/logits": 0.2609986811876297, + "step": 3674 + }, + { + "epoch": 0.22975, + "grad_norm": 2.859375, + "grad_norm_var": 0.021361287434895834, + "learning_rate": 0.0001, + "loss": 7.9882, + "loss/crossentropy": 2.400329113006592, + "loss/hidden": 3.109375, + "loss/jsd": 0.0, + "loss/logits": 0.2339789718389511, + "step": 3676 + }, + { + "epoch": 0.229875, + "grad_norm": 2.828125, + "grad_norm_var": 0.021223958333333334, + "learning_rate": 0.0001, + "loss": 7.7641, + "loss/crossentropy": 2.1139498949050903, + "loss/hidden": 3.0390625, + "loss/jsd": 0.0, + "loss/logits": 0.24976061284542084, + "step": 3678 + }, + { + "epoch": 0.23, + "grad_norm": 2.875, + "grad_norm_var": 0.021142578125, + "learning_rate": 0.0001, + "loss": 8.0759, + "loss/crossentropy": 2.3943647146224976, + "loss/hidden": 3.1640625, + "loss/jsd": 0.0, + "loss/logits": 0.2582573890686035, + "step": 3680 + }, + { + "epoch": 0.230125, + "grad_norm": 3.046875, + "grad_norm_var": 0.02203369140625, + "learning_rate": 0.0001, + "loss": 8.1859, + "loss/crossentropy": 2.6016229391098022, + "loss/hidden": 3.28125, + "loss/jsd": 0.0, + "loss/logits": 0.2739368677139282, + "step": 3682 + }, + { + "epoch": 0.23025, + "grad_norm": 2.6875, + "grad_norm_var": 0.020796712239583334, + "learning_rate": 0.0001, + "loss": 7.8159, + "loss/crossentropy": 2.18959903717041, + "loss/hidden": 3.09375, + "loss/jsd": 0.0, + "loss/logits": 0.24328485131263733, + "step": 3684 + }, + { + "epoch": 0.230375, + "grad_norm": 3.109375, + "grad_norm_var": 0.017704264322916666, + "learning_rate": 0.0001, + "loss": 8.11, + "loss/crossentropy": 2.5320764780044556, + "loss/hidden": 3.125, + "loss/jsd": 0.0, + "loss/logits": 0.24227013438940048, + "step": 3686 + }, + { + "epoch": 0.2305, + "grad_norm": 2.734375, + "grad_norm_var": 0.018512980143229166, + "learning_rate": 0.0001, + "loss": 7.8237, + "loss/crossentropy": 2.3462241888046265, + "loss/hidden": 3.109375, + "loss/jsd": 0.0, + "loss/logits": 0.25317418575286865, + "step": 3688 + }, + { + "epoch": 0.230625, + "grad_norm": 2.8125, + "grad_norm_var": 0.016243489583333333, + "learning_rate": 0.0001, + "loss": 7.8874, + "loss/crossentropy": 2.1772103309631348, + "loss/hidden": 3.140625, + "loss/jsd": 0.0, + "loss/logits": 0.2475140392780304, + "step": 3690 + }, + { + "epoch": 0.23075, + "grad_norm": 2.703125, + "grad_norm_var": 0.0204254150390625, + "learning_rate": 0.0001, + "loss": 8.0884, + "loss/crossentropy": 2.16398286819458, + "loss/hidden": 3.1484375, + "loss/jsd": 0.0, + "loss/logits": 0.26599203050136566, + "step": 3692 + }, + { + "epoch": 0.230875, + "grad_norm": 2.90625, + "grad_norm_var": 0.018648274739583335, + "learning_rate": 0.0001, + "loss": 7.8644, + "loss/crossentropy": 2.2632850408554077, + "loss/hidden": 3.09375, + "loss/jsd": 0.0, + "loss/logits": 0.24153126776218414, + "step": 3694 + }, + { + "epoch": 0.231, + "grad_norm": 2.921875, + "grad_norm_var": 0.019066365559895833, + "learning_rate": 0.0001, + "loss": 8.0305, + "loss/crossentropy": 2.3470261096954346, + "loss/hidden": 3.0859375, + "loss/jsd": 0.0, + "loss/logits": 0.24013527482748032, + "step": 3696 + }, + { + "epoch": 0.231125, + "grad_norm": 3.015625, + "grad_norm_var": 0.018748982747395834, + "learning_rate": 0.0001, + "loss": 7.8235, + "loss/crossentropy": 2.21561336517334, + "loss/hidden": 3.109375, + "loss/jsd": 0.0, + "loss/logits": 0.2327084168791771, + "step": 3698 + }, + { + "epoch": 0.23125, + "grad_norm": 2.78125, + "grad_norm_var": 0.0171783447265625, + "learning_rate": 0.0001, + "loss": 7.9991, + "loss/crossentropy": 2.2310367822647095, + "loss/hidden": 3.1484375, + "loss/jsd": 0.0, + "loss/logits": 0.242506742477417, + "step": 3700 + }, + { + "epoch": 0.231375, + "grad_norm": 3.078125, + "grad_norm_var": 0.016852823893229167, + "learning_rate": 0.0001, + "loss": 8.1388, + "loss/crossentropy": 2.515184164047241, + "loss/hidden": 3.1875, + "loss/jsd": 0.0, + "loss/logits": 0.2872355580329895, + "step": 3702 + }, + { + "epoch": 0.2315, + "grad_norm": 2.640625, + "grad_norm_var": 0.019782511393229167, + "learning_rate": 0.0001, + "loss": 7.9631, + "loss/crossentropy": 2.2909129858016968, + "loss/hidden": 3.0546875, + "loss/jsd": 0.0, + "loss/logits": 0.23802363872528076, + "step": 3704 + }, + { + "epoch": 0.231625, + "grad_norm": 2.875, + "grad_norm_var": 0.017671712239583335, + "learning_rate": 0.0001, + "loss": 7.8775, + "loss/crossentropy": 2.387066602706909, + "loss/hidden": 3.1015625, + "loss/jsd": 0.0, + "loss/logits": 0.24512099474668503, + "step": 3706 + }, + { + "epoch": 0.23175, + "grad_norm": 2.765625, + "grad_norm_var": 0.014404296875, + "learning_rate": 0.0001, + "loss": 7.8625, + "loss/crossentropy": 2.254941940307617, + "loss/hidden": 3.1796875, + "loss/jsd": 0.0, + "loss/logits": 0.2513599842786789, + "step": 3708 + }, + { + "epoch": 0.231875, + "grad_norm": 2.828125, + "grad_norm_var": 0.013765462239583333, + "learning_rate": 0.0001, + "loss": 7.8971, + "loss/crossentropy": 2.142080545425415, + "loss/hidden": 3.0234375, + "loss/jsd": 0.0, + "loss/logits": 0.2329133376479149, + "step": 3710 + }, + { + "epoch": 0.232, + "grad_norm": 2.734375, + "grad_norm_var": 0.017878214518229168, + "learning_rate": 0.0001, + "loss": 7.7952, + "loss/crossentropy": 2.3408135175704956, + "loss/hidden": 3.09375, + "loss/jsd": 0.0, + "loss/logits": 0.245710588991642, + "step": 3712 + }, + { + "epoch": 0.232125, + "grad_norm": 2.75, + "grad_norm_var": 0.016499837239583332, + "learning_rate": 0.0001, + "loss": 7.9523, + "loss/crossentropy": 2.4113335609436035, + "loss/hidden": 3.1328125, + "loss/jsd": 0.0, + "loss/logits": 0.23740407824516296, + "step": 3714 + }, + { + "epoch": 0.23225, + "grad_norm": 2.9375, + "grad_norm_var": 0.020637003580729167, + "learning_rate": 0.0001, + "loss": 8.2332, + "loss/crossentropy": 2.397470474243164, + "loss/hidden": 3.1640625, + "loss/jsd": 0.0, + "loss/logits": 0.3087628036737442, + "step": 3716 + }, + { + "epoch": 0.232375, + "grad_norm": 2.734375, + "grad_norm_var": 0.017769368489583333, + "learning_rate": 0.0001, + "loss": 8.0103, + "loss/crossentropy": 2.3515301942825317, + "loss/hidden": 3.171875, + "loss/jsd": 0.0, + "loss/logits": 0.24831266701221466, + "step": 3718 + }, + { + "epoch": 0.2325, + "grad_norm": 2.9375, + "grad_norm_var": 0.014957682291666666, + "learning_rate": 0.0001, + "loss": 7.916, + "loss/crossentropy": 2.222606897354126, + "loss/hidden": 3.046875, + "loss/jsd": 0.0, + "loss/logits": 0.24274057149887085, + "step": 3720 + }, + { + "epoch": 0.232625, + "grad_norm": 3.125, + "grad_norm_var": 0.019331868489583334, + "learning_rate": 0.0001, + "loss": 8.1821, + "loss/crossentropy": 2.3737378120422363, + "loss/hidden": 3.125, + "loss/jsd": 0.0, + "loss/logits": 0.24392034113407135, + "step": 3722 + }, + { + "epoch": 0.23275, + "grad_norm": 3.140625, + "grad_norm_var": 0.026953125, + "learning_rate": 0.0001, + "loss": 8.0112, + "loss/crossentropy": 2.5040173530578613, + "loss/hidden": 3.1953125, + "loss/jsd": 0.0, + "loss/logits": 0.26996733248233795, + "step": 3724 + }, + { + "epoch": 0.232875, + "grad_norm": 2.71875, + "grad_norm_var": 0.028645833333333332, + "learning_rate": 0.0001, + "loss": 7.9624, + "loss/crossentropy": 2.274856448173523, + "loss/hidden": 3.140625, + "loss/jsd": 0.0, + "loss/logits": 0.26013311743736267, + "step": 3726 + }, + { + "epoch": 0.233, + "grad_norm": 2.734375, + "grad_norm_var": 0.028172810872395832, + "learning_rate": 0.0001, + "loss": 7.9561, + "loss/crossentropy": 2.2882769107818604, + "loss/hidden": 3.0625, + "loss/jsd": 0.0, + "loss/logits": 0.23719671368598938, + "step": 3728 + }, + { + "epoch": 0.233125, + "grad_norm": 2.859375, + "grad_norm_var": 0.0279449462890625, + "learning_rate": 0.0001, + "loss": 7.9788, + "loss/crossentropy": 2.222030520439148, + "loss/hidden": 3.15625, + "loss/jsd": 0.0, + "loss/logits": 0.2667320519685745, + "step": 3730 + }, + { + "epoch": 0.23325, + "grad_norm": 2.890625, + "grad_norm_var": 0.0249664306640625, + "learning_rate": 0.0001, + "loss": 7.8122, + "loss/crossentropy": 2.127245843410492, + "loss/hidden": 3.109375, + "loss/jsd": 0.0, + "loss/logits": 0.2510572522878647, + "step": 3732 + }, + { + "epoch": 0.233375, + "grad_norm": 2.84375, + "grad_norm_var": 0.025081380208333334, + "learning_rate": 0.0001, + "loss": 7.8065, + "loss/crossentropy": 2.412429094314575, + "loss/hidden": 3.109375, + "loss/jsd": 0.0, + "loss/logits": 0.2435067892074585, + "step": 3734 + }, + { + "epoch": 0.2335, + "grad_norm": 2.90625, + "grad_norm_var": 0.025406901041666666, + "learning_rate": 0.0001, + "loss": 7.9871, + "loss/crossentropy": 2.1098079085350037, + "loss/hidden": 3.1875, + "loss/jsd": 0.0, + "loss/logits": 0.28457003831863403, + "step": 3736 + }, + { + "epoch": 0.233625, + "grad_norm": 3.09375, + "grad_norm_var": 0.027164713541666666, + "learning_rate": 0.0001, + "loss": 7.9466, + "loss/crossentropy": 2.1928519010543823, + "loss/hidden": 3.125, + "loss/jsd": 0.0, + "loss/logits": 0.25453677773475647, + "step": 3738 + }, + { + "epoch": 0.23375, + "grad_norm": 2.75, + "grad_norm_var": 0.0246978759765625, + "learning_rate": 0.0001, + "loss": 8.0324, + "loss/crossentropy": 2.5346169471740723, + "loss/hidden": 3.125, + "loss/jsd": 0.0, + "loss/logits": 0.2512079030275345, + "step": 3740 + }, + { + "epoch": 0.233875, + "grad_norm": 2.75, + "grad_norm_var": 0.025716145833333332, + "learning_rate": 0.0001, + "loss": 8.0987, + "loss/crossentropy": 2.3410524129867554, + "loss/hidden": 3.1328125, + "loss/jsd": 0.0, + "loss/logits": 0.2596224248409271, + "step": 3742 + }, + { + "epoch": 0.234, + "grad_norm": 2.859375, + "grad_norm_var": 0.022261555989583334, + "learning_rate": 0.0001, + "loss": 7.7984, + "loss/crossentropy": 2.353347420692444, + "loss/hidden": 3.1484375, + "loss/jsd": 0.0, + "loss/logits": 0.26650838553905487, + "step": 3744 + }, + { + "epoch": 0.234125, + "grad_norm": 2.90625, + "grad_norm_var": 0.025519816080729167, + "learning_rate": 0.0001, + "loss": 8.1373, + "loss/crossentropy": 2.4095019102096558, + "loss/hidden": 3.1875, + "loss/jsd": 0.0, + "loss/logits": 0.26190927624702454, + "step": 3746 + }, + { + "epoch": 0.23425, + "grad_norm": 2.734375, + "grad_norm_var": 0.028937784830729167, + "learning_rate": 0.0001, + "loss": 7.6916, + "loss/crossentropy": 2.0359702110290527, + "loss/hidden": 3.0, + "loss/jsd": 0.0, + "loss/logits": 0.2307998687028885, + "step": 3748 + }, + { + "epoch": 0.234375, + "grad_norm": 3.015625, + "grad_norm_var": 0.028999837239583333, + "learning_rate": 0.0001, + "loss": 7.792, + "loss/crossentropy": 2.168085813522339, + "loss/hidden": 3.1171875, + "loss/jsd": 0.0, + "loss/logits": 0.23495958745479584, + "step": 3750 + }, + { + "epoch": 0.2345, + "grad_norm": 3.265625, + "grad_norm_var": 0.04117431640625, + "learning_rate": 0.0001, + "loss": 7.9602, + "loss/crossentropy": 2.026396870613098, + "loss/hidden": 3.0859375, + "loss/jsd": 0.0, + "loss/logits": 0.2430543154478073, + "step": 3752 + }, + { + "epoch": 0.234625, + "grad_norm": 3.015625, + "grad_norm_var": 0.037694295247395836, + "learning_rate": 0.0001, + "loss": 7.9993, + "loss/crossentropy": 2.4510369300842285, + "loss/hidden": 3.1796875, + "loss/jsd": 0.0, + "loss/logits": 0.27556227147579193, + "step": 3754 + }, + { + "epoch": 0.23475, + "grad_norm": 2.875, + "grad_norm_var": 0.03394266764322917, + "learning_rate": 0.0001, + "loss": 7.9829, + "loss/crossentropy": 2.0846810340881348, + "loss/hidden": 3.015625, + "loss/jsd": 0.0, + "loss/logits": 0.235876202583313, + "step": 3756 + }, + { + "epoch": 0.234875, + "grad_norm": 2.84375, + "grad_norm_var": 0.031266276041666666, + "learning_rate": 0.0001, + "loss": 7.8668, + "loss/crossentropy": 2.279644012451172, + "loss/hidden": 3.1015625, + "loss/jsd": 0.0, + "loss/logits": 0.250064454972744, + "step": 3758 + }, + { + "epoch": 0.235, + "grad_norm": 2.765625, + "grad_norm_var": 0.0311431884765625, + "learning_rate": 0.0001, + "loss": 7.9483, + "loss/crossentropy": 2.1563061475753784, + "loss/hidden": 3.1328125, + "loss/jsd": 0.0, + "loss/logits": 0.258032388985157, + "step": 3760 + }, + { + "epoch": 0.235125, + "grad_norm": 2.65625, + "grad_norm_var": 0.031538899739583334, + "learning_rate": 0.0001, + "loss": 7.8565, + "loss/crossentropy": 2.071030616760254, + "loss/hidden": 3.140625, + "loss/jsd": 0.0, + "loss/logits": 0.23593812435865402, + "step": 3762 + }, + { + "epoch": 0.23525, + "grad_norm": 3.0, + "grad_norm_var": 0.027887980143229168, + "learning_rate": 0.0001, + "loss": 8.0189, + "loss/crossentropy": 2.3263286352157593, + "loss/hidden": 3.140625, + "loss/jsd": 0.0, + "loss/logits": 0.25163403153419495, + "step": 3764 + }, + { + "epoch": 0.235375, + "grad_norm": 2.75, + "grad_norm_var": 0.02740478515625, + "learning_rate": 0.0001, + "loss": 7.7014, + "loss/crossentropy": 2.456760048866272, + "loss/hidden": 3.078125, + "loss/jsd": 0.0, + "loss/logits": 0.23420259356498718, + "step": 3766 + }, + { + "epoch": 0.2355, + "grad_norm": 2.953125, + "grad_norm_var": 0.012044270833333334, + "learning_rate": 0.0001, + "loss": 8.0371, + "loss/crossentropy": 2.369943618774414, + "loss/hidden": 3.1796875, + "loss/jsd": 0.0, + "loss/logits": 0.2603626102209091, + "step": 3768 + }, + { + "epoch": 0.235625, + "grad_norm": 2.9375, + "grad_norm_var": 0.010106404622395834, + "learning_rate": 0.0001, + "loss": 8.1286, + "loss/crossentropy": 2.2771997451782227, + "loss/hidden": 3.140625, + "loss/jsd": 0.0, + "loss/logits": 0.2527083158493042, + "step": 3770 + }, + { + "epoch": 0.23575, + "grad_norm": 2.84375, + "grad_norm_var": 0.0128326416015625, + "learning_rate": 0.0001, + "loss": 8.0419, + "loss/crossentropy": 2.3018823862075806, + "loss/hidden": 3.1640625, + "loss/jsd": 0.0, + "loss/logits": 0.2760903537273407, + "step": 3772 + }, + { + "epoch": 0.235875, + "grad_norm": 2.6875, + "grad_norm_var": 0.0157135009765625, + "learning_rate": 0.0001, + "loss": 7.7634, + "loss/crossentropy": 2.147810459136963, + "loss/hidden": 3.09375, + "loss/jsd": 0.0, + "loss/logits": 0.2414494976401329, + "step": 3774 + }, + { + "epoch": 0.236, + "grad_norm": 2.890625, + "grad_norm_var": 0.024267578125, + "learning_rate": 0.0001, + "loss": 8.0522, + "loss/crossentropy": 2.5797489881515503, + "loss/hidden": 3.140625, + "loss/jsd": 0.0, + "loss/logits": 0.25508859008550644, + "step": 3776 + }, + { + "epoch": 0.236125, + "grad_norm": 2.890625, + "grad_norm_var": 0.02109375, + "learning_rate": 0.0001, + "loss": 8.0457, + "loss/crossentropy": 2.334934711456299, + "loss/hidden": 3.078125, + "loss/jsd": 0.0, + "loss/logits": 0.24477149546146393, + "step": 3778 + }, + { + "epoch": 0.23625, + "grad_norm": 2.78125, + "grad_norm_var": 0.0207672119140625, + "learning_rate": 0.0001, + "loss": 7.7548, + "loss/crossentropy": 2.496270179748535, + "loss/hidden": 3.2265625, + "loss/jsd": 0.0, + "loss/logits": 0.25811365991830826, + "step": 3780 + }, + { + "epoch": 0.236375, + "grad_norm": 2.96875, + "grad_norm_var": 0.019074503580729166, + "learning_rate": 0.0001, + "loss": 8.0027, + "loss/crossentropy": 2.565447449684143, + "loss/hidden": 3.15625, + "loss/jsd": 0.0, + "loss/logits": 0.2610893249511719, + "step": 3782 + }, + { + "epoch": 0.2365, + "grad_norm": 2.84375, + "grad_norm_var": 0.022044881184895834, + "learning_rate": 0.0001, + "loss": 8.0034, + "loss/crossentropy": 2.288913130760193, + "loss/hidden": 3.0703125, + "loss/jsd": 0.0, + "loss/logits": 0.24560889601707458, + "step": 3784 + }, + { + "epoch": 0.236625, + "grad_norm": 2.921875, + "grad_norm_var": 0.0223297119140625, + "learning_rate": 0.0001, + "loss": 7.9241, + "loss/crossentropy": 2.4364962577819824, + "loss/hidden": 3.1328125, + "loss/jsd": 0.0, + "loss/logits": 0.26674768328666687, + "step": 3786 + }, + { + "epoch": 0.23675, + "grad_norm": 2.875, + "grad_norm_var": 0.018504842122395834, + "learning_rate": 0.0001, + "loss": 8.1114, + "loss/crossentropy": 2.3721729516983032, + "loss/hidden": 3.1796875, + "loss/jsd": 0.0, + "loss/logits": 0.2508590742945671, + "step": 3788 + }, + { + "epoch": 0.236875, + "grad_norm": 2.90625, + "grad_norm_var": 0.014989217122395834, + "learning_rate": 0.0001, + "loss": 8.1629, + "loss/crossentropy": 2.441157817840576, + "loss/hidden": 3.09375, + "loss/jsd": 0.0, + "loss/logits": 0.2336357906460762, + "step": 3790 + }, + { + "epoch": 0.237, + "grad_norm": 2.71875, + "grad_norm_var": 0.0092437744140625, + "learning_rate": 0.0001, + "loss": 8.0601, + "loss/crossentropy": 2.3760870695114136, + "loss/hidden": 3.1875, + "loss/jsd": 0.0, + "loss/logits": 0.24517250061035156, + "step": 3792 + }, + { + "epoch": 0.237125, + "grad_norm": 2.78125, + "grad_norm_var": 0.009635416666666667, + "learning_rate": 0.0001, + "loss": 7.7626, + "loss/crossentropy": 2.254652261734009, + "loss/hidden": 3.109375, + "loss/jsd": 0.0, + "loss/logits": 0.23266702890396118, + "step": 3794 + }, + { + "epoch": 0.23725, + "grad_norm": 2.953125, + "grad_norm_var": 0.010985310872395833, + "learning_rate": 0.0001, + "loss": 7.9964, + "loss/crossentropy": 2.3335236310958862, + "loss/hidden": 3.125, + "loss/jsd": 0.0, + "loss/logits": 0.24718395620584488, + "step": 3796 + }, + { + "epoch": 0.237375, + "grad_norm": 2.8125, + "grad_norm_var": 0.010505167643229167, + "learning_rate": 0.0001, + "loss": 7.9836, + "loss/crossentropy": 2.146742820739746, + "loss/hidden": 3.1171875, + "loss/jsd": 0.0, + "loss/logits": 0.23980651795864105, + "step": 3798 + }, + { + "epoch": 0.2375, + "grad_norm": 2.75, + "grad_norm_var": 0.0105133056640625, + "learning_rate": 0.0001, + "loss": 8.0158, + "loss/crossentropy": 2.413538336753845, + "loss/hidden": 3.0859375, + "loss/jsd": 0.0, + "loss/logits": 0.2546129822731018, + "step": 3800 + }, + { + "epoch": 0.237625, + "grad_norm": 2.78125, + "grad_norm_var": 0.010221354166666667, + "learning_rate": 0.0001, + "loss": 7.8914, + "loss/crossentropy": 2.3957451581954956, + "loss/hidden": 3.1953125, + "loss/jsd": 0.0, + "loss/logits": 0.2576509118080139, + "step": 3802 + }, + { + "epoch": 0.23775, + "grad_norm": 3.328125, + "grad_norm_var": 0.0312652587890625, + "learning_rate": 0.0001, + "loss": 7.8871, + "loss/crossentropy": 2.288939356803894, + "loss/hidden": 3.1875, + "loss/jsd": 0.0, + "loss/logits": 0.2728182524442673, + "step": 3804 + }, + { + "epoch": 0.237875, + "grad_norm": 2.828125, + "grad_norm_var": 0.029573567708333335, + "learning_rate": 0.0001, + "loss": 7.9958, + "loss/crossentropy": 2.3543388843536377, + "loss/hidden": 3.203125, + "loss/jsd": 0.0, + "loss/logits": 0.25791068375110626, + "step": 3806 + }, + { + "epoch": 0.238, + "grad_norm": 3.078125, + "grad_norm_var": 0.0307769775390625, + "learning_rate": 0.0001, + "loss": 8.0146, + "loss/crossentropy": 2.452141761779785, + "loss/hidden": 3.15625, + "loss/jsd": 0.0, + "loss/logits": 0.23337870091199875, + "step": 3808 + }, + { + "epoch": 0.238125, + "grad_norm": 2.65625, + "grad_norm_var": 0.03359375, + "learning_rate": 0.0001, + "loss": 7.8997, + "loss/crossentropy": 2.3330483436584473, + "loss/hidden": 3.078125, + "loss/jsd": 0.0, + "loss/logits": 0.25078994035720825, + "step": 3810 + }, + { + "epoch": 0.23825, + "grad_norm": 2.734375, + "grad_norm_var": 0.03251851399739583, + "learning_rate": 0.0001, + "loss": 7.8959, + "loss/crossentropy": 2.0975863933563232, + "loss/hidden": 3.078125, + "loss/jsd": 0.0, + "loss/logits": 0.2229529693722725, + "step": 3812 + }, + { + "epoch": 0.238375, + "grad_norm": 2.765625, + "grad_norm_var": 0.03323567708333333, + "learning_rate": 0.0001, + "loss": 7.816, + "loss/crossentropy": 2.0993716716766357, + "loss/hidden": 3.2109375, + "loss/jsd": 0.0, + "loss/logits": 0.24615749716758728, + "step": 3814 + }, + { + "epoch": 0.2385, + "grad_norm": 3.171875, + "grad_norm_var": 0.037984212239583336, + "learning_rate": 0.0001, + "loss": 8.0614, + "loss/crossentropy": 2.336126208305359, + "loss/hidden": 3.1015625, + "loss/jsd": 0.0, + "loss/logits": 0.24437396228313446, + "step": 3816 + }, + { + "epoch": 0.238625, + "grad_norm": 2.671875, + "grad_norm_var": 0.039948527018229166, + "learning_rate": 0.0001, + "loss": 7.9033, + "loss/crossentropy": 2.3778291940689087, + "loss/hidden": 3.15625, + "loss/jsd": 0.0, + "loss/logits": 0.2515903264284134, + "step": 3818 + }, + { + "epoch": 0.23875, + "grad_norm": 2.921875, + "grad_norm_var": 0.019136555989583335, + "learning_rate": 0.0001, + "loss": 7.8767, + "loss/crossentropy": 2.2030457258224487, + "loss/hidden": 3.1015625, + "loss/jsd": 0.0, + "loss/logits": 0.2293495386838913, + "step": 3820 + }, + { + "epoch": 0.238875, + "grad_norm": 2.859375, + "grad_norm_var": 0.02066650390625, + "learning_rate": 0.0001, + "loss": 7.8254, + "loss/crossentropy": 2.393824577331543, + "loss/hidden": 3.1171875, + "loss/jsd": 0.0, + "loss/logits": 0.23858655989170074, + "step": 3822 + }, + { + "epoch": 0.239, + "grad_norm": 2.640625, + "grad_norm_var": 0.019579060872395835, + "learning_rate": 0.0001, + "loss": 7.7794, + "loss/crossentropy": 2.202533006668091, + "loss/hidden": 3.1328125, + "loss/jsd": 0.0, + "loss/logits": 0.2381155639886856, + "step": 3824 + }, + { + "epoch": 0.239125, + "grad_norm": 2.71875, + "grad_norm_var": 0.017513020833333334, + "learning_rate": 0.0001, + "loss": 7.8826, + "loss/crossentropy": 2.2683571577072144, + "loss/hidden": 3.0390625, + "loss/jsd": 0.0, + "loss/logits": 0.24590277671813965, + "step": 3826 + }, + { + "epoch": 0.23925, + "grad_norm": 3.046875, + "grad_norm_var": 0.0197662353515625, + "learning_rate": 0.0001, + "loss": 7.9919, + "loss/crossentropy": 2.3534783124923706, + "loss/hidden": 3.0703125, + "loss/jsd": 0.0, + "loss/logits": 0.2405027374625206, + "step": 3828 + }, + { + "epoch": 0.239375, + "grad_norm": 2.875, + "grad_norm_var": 0.020406087239583332, + "learning_rate": 0.0001, + "loss": 8.1266, + "loss/crossentropy": 2.3716864585876465, + "loss/hidden": 3.15625, + "loss/jsd": 0.0, + "loss/logits": 0.2505777180194855, + "step": 3830 + }, + { + "epoch": 0.2395, + "grad_norm": 3.0, + "grad_norm_var": 0.015751139322916666, + "learning_rate": 0.0001, + "loss": 7.947, + "loss/crossentropy": 2.482056736946106, + "loss/hidden": 3.2109375, + "loss/jsd": 0.0, + "loss/logits": 0.2586553245782852, + "step": 3832 + }, + { + "epoch": 0.239625, + "grad_norm": 2.953125, + "grad_norm_var": 0.016730753580729167, + "learning_rate": 0.0001, + "loss": 8.0148, + "loss/crossentropy": 2.323951005935669, + "loss/hidden": 3.0546875, + "loss/jsd": 0.0, + "loss/logits": 0.24310573935508728, + "step": 3834 + }, + { + "epoch": 0.23975, + "grad_norm": 3.3125, + "grad_norm_var": 0.027425130208333332, + "learning_rate": 0.0001, + "loss": 8.0583, + "loss/crossentropy": 2.2353726625442505, + "loss/hidden": 3.1953125, + "loss/jsd": 0.0, + "loss/logits": 0.2633824050426483, + "step": 3836 + }, + { + "epoch": 0.239875, + "grad_norm": 2.828125, + "grad_norm_var": 0.02568359375, + "learning_rate": 0.0001, + "loss": 7.9074, + "loss/crossentropy": 2.1682406663894653, + "loss/hidden": 3.09375, + "loss/jsd": 0.0, + "loss/logits": 0.25316306948661804, + "step": 3838 + }, + { + "epoch": 0.24, + "grad_norm": 2.671875, + "grad_norm_var": 0.0255767822265625, + "learning_rate": 0.0001, + "loss": 7.76, + "loss/crossentropy": 2.3561817407608032, + "loss/hidden": 3.0390625, + "loss/jsd": 0.0, + "loss/logits": 0.24703025817871094, + "step": 3840 + }, + { + "epoch": 0.240125, + "grad_norm": 2.84375, + "grad_norm_var": 0.023346964518229166, + "learning_rate": 0.0001, + "loss": 7.8848, + "loss/crossentropy": 2.1207789182662964, + "loss/hidden": 3.0625, + "loss/jsd": 0.0, + "loss/logits": 0.24431215226650238, + "step": 3842 + }, + { + "epoch": 0.24025, + "grad_norm": 2.796875, + "grad_norm_var": 0.0224761962890625, + "learning_rate": 0.0001, + "loss": 7.8809, + "loss/crossentropy": 2.177114486694336, + "loss/hidden": 3.09375, + "loss/jsd": 0.0, + "loss/logits": 0.24755840003490448, + "step": 3844 + }, + { + "epoch": 0.240375, + "grad_norm": 2.734375, + "grad_norm_var": 0.02340087890625, + "learning_rate": 0.0001, + "loss": 7.8049, + "loss/crossentropy": 2.241698145866394, + "loss/hidden": 3.0703125, + "loss/jsd": 0.0, + "loss/logits": 0.23916028439998627, + "step": 3846 + }, + { + "epoch": 0.2405, + "grad_norm": 2.9375, + "grad_norm_var": 0.023209635416666666, + "learning_rate": 0.0001, + "loss": 8.1238, + "loss/crossentropy": 2.3270636796951294, + "loss/hidden": 3.1484375, + "loss/jsd": 0.0, + "loss/logits": 0.2830345705151558, + "step": 3848 + }, + { + "epoch": 0.240625, + "grad_norm": 2.75, + "grad_norm_var": 0.02125244140625, + "learning_rate": 0.0001, + "loss": 7.9985, + "loss/crossentropy": 2.306910753250122, + "loss/hidden": 3.078125, + "loss/jsd": 0.0, + "loss/logits": 0.24260255694389343, + "step": 3850 + }, + { + "epoch": 0.24075, + "grad_norm": 2.65625, + "grad_norm_var": 0.0091461181640625, + "learning_rate": 0.0001, + "loss": 7.898, + "loss/crossentropy": 2.0596543550491333, + "loss/hidden": 3.0859375, + "loss/jsd": 0.0, + "loss/logits": 0.23719244450330734, + "step": 3852 + }, + { + "epoch": 0.240875, + "grad_norm": 2.78125, + "grad_norm_var": 0.009300740559895833, + "learning_rate": 0.0001, + "loss": 7.8158, + "loss/crossentropy": 2.3392993211746216, + "loss/hidden": 3.1171875, + "loss/jsd": 0.0, + "loss/logits": 0.25461700558662415, + "step": 3854 + }, + { + "epoch": 0.241, + "grad_norm": 3.015625, + "grad_norm_var": 0.011649576822916667, + "learning_rate": 0.0001, + "loss": 7.9635, + "loss/crossentropy": 2.7126669883728027, + "loss/hidden": 3.15625, + "loss/jsd": 0.0, + "loss/logits": 0.2701471447944641, + "step": 3856 + }, + { + "epoch": 0.241125, + "grad_norm": 2.5625, + "grad_norm_var": 0.01881103515625, + "learning_rate": 0.0001, + "loss": 7.8456, + "loss/crossentropy": 2.2381919622421265, + "loss/hidden": 3.125, + "loss/jsd": 0.0, + "loss/logits": 0.2312241867184639, + "step": 3858 + }, + { + "epoch": 0.24125, + "grad_norm": 2.734375, + "grad_norm_var": 0.019188435872395833, + "learning_rate": 0.0001, + "loss": 7.8947, + "loss/crossentropy": 2.3995821475982666, + "loss/hidden": 3.0859375, + "loss/jsd": 0.0, + "loss/logits": 0.24393048137426376, + "step": 3860 + }, + { + "epoch": 0.241375, + "grad_norm": 2.84375, + "grad_norm_var": 0.0181060791015625, + "learning_rate": 0.0001, + "loss": 7.9092, + "loss/crossentropy": 2.254945397377014, + "loss/hidden": 3.125, + "loss/jsd": 0.0, + "loss/logits": 0.24428366869688034, + "step": 3862 + }, + { + "epoch": 0.2415, + "grad_norm": 2.640625, + "grad_norm_var": 0.0165924072265625, + "learning_rate": 0.0001, + "loss": 7.8811, + "loss/crossentropy": 2.392805576324463, + "loss/hidden": 3.0859375, + "loss/jsd": 0.0, + "loss/logits": 0.24890445917844772, + "step": 3864 + }, + { + "epoch": 0.241625, + "grad_norm": 2.90625, + "grad_norm_var": 0.016923014322916666, + "learning_rate": 0.0001, + "loss": 8.0593, + "loss/crossentropy": 2.224582314491272, + "loss/hidden": 3.1171875, + "loss/jsd": 0.0, + "loss/logits": 0.2585388273000717, + "step": 3866 + }, + { + "epoch": 0.24175, + "grad_norm": 2.859375, + "grad_norm_var": 0.015851847330729165, + "learning_rate": 0.0001, + "loss": 8.0158, + "loss/crossentropy": 2.367936611175537, + "loss/hidden": 3.125, + "loss/jsd": 0.0, + "loss/logits": 0.23479987680912018, + "step": 3868 + }, + { + "epoch": 0.241875, + "grad_norm": 2.953125, + "grad_norm_var": 0.0171295166015625, + "learning_rate": 0.0001, + "loss": 7.9695, + "loss/crossentropy": 2.394118547439575, + "loss/hidden": 3.09375, + "loss/jsd": 0.0, + "loss/logits": 0.2584435045719147, + "step": 3870 + }, + { + "epoch": 0.242, + "grad_norm": 3.234375, + "grad_norm_var": 0.0255859375, + "learning_rate": 0.0001, + "loss": 8.2134, + "loss/crossentropy": 2.4198135137557983, + "loss/hidden": 3.1171875, + "loss/jsd": 0.0, + "loss/logits": 0.2768760621547699, + "step": 3872 + }, + { + "epoch": 0.242125, + "grad_norm": 2.84375, + "grad_norm_var": 0.019270833333333334, + "learning_rate": 0.0001, + "loss": 7.9406, + "loss/crossentropy": 2.373674988746643, + "loss/hidden": 3.1171875, + "loss/jsd": 0.0, + "loss/logits": 0.24084321409463882, + "step": 3874 + }, + { + "epoch": 0.24225, + "grad_norm": 2.859375, + "grad_norm_var": 0.017186482747395832, + "learning_rate": 0.0001, + "loss": 8.0494, + "loss/crossentropy": 2.1835745573043823, + "loss/hidden": 3.140625, + "loss/jsd": 0.0, + "loss/logits": 0.23795650899410248, + "step": 3876 + }, + { + "epoch": 0.242375, + "grad_norm": 3.15625, + "grad_norm_var": 0.020699055989583333, + "learning_rate": 0.0001, + "loss": 8.1841, + "loss/crossentropy": 2.590933918952942, + "loss/hidden": 3.3203125, + "loss/jsd": 0.0, + "loss/logits": 0.2828524559736252, + "step": 3878 + }, + { + "epoch": 0.2425, + "grad_norm": 2.6875, + "grad_norm_var": 0.019820149739583334, + "learning_rate": 0.0001, + "loss": 7.9971, + "loss/crossentropy": 2.4417784214019775, + "loss/hidden": 3.0546875, + "loss/jsd": 0.0, + "loss/logits": 0.2358105331659317, + "step": 3880 + }, + { + "epoch": 0.242625, + "grad_norm": 2.875, + "grad_norm_var": 0.0228515625, + "learning_rate": 0.0001, + "loss": 7.927, + "loss/crossentropy": 2.514025568962097, + "loss/hidden": 3.1328125, + "loss/jsd": 0.0, + "loss/logits": 0.25648288428783417, + "step": 3882 + }, + { + "epoch": 0.24275, + "grad_norm": 2.90625, + "grad_norm_var": 0.022574869791666667, + "learning_rate": 0.0001, + "loss": 8.0021, + "loss/crossentropy": 2.2680565118789673, + "loss/hidden": 3.1328125, + "loss/jsd": 0.0, + "loss/logits": 0.24664485454559326, + "step": 3884 + }, + { + "epoch": 0.242875, + "grad_norm": 2.828125, + "grad_norm_var": 0.0239898681640625, + "learning_rate": 0.0001, + "loss": 7.648, + "loss/crossentropy": 2.0859988927841187, + "loss/hidden": 3.0703125, + "loss/jsd": 0.0, + "loss/logits": 0.2396468073129654, + "step": 3886 + }, + { + "epoch": 0.243, + "grad_norm": 2.921875, + "grad_norm_var": 0.016022745768229166, + "learning_rate": 0.0001, + "loss": 7.9453, + "loss/crossentropy": 2.346192240715027, + "loss/hidden": 3.1015625, + "loss/jsd": 0.0, + "loss/logits": 0.2454293891787529, + "step": 3888 + }, + { + "epoch": 0.243125, + "grad_norm": 2.734375, + "grad_norm_var": 0.016630045572916665, + "learning_rate": 0.0001, + "loss": 7.8395, + "loss/crossentropy": 2.2563360929489136, + "loss/hidden": 3.1328125, + "loss/jsd": 0.0, + "loss/logits": 0.25612016022205353, + "step": 3890 + }, + { + "epoch": 0.24325, + "grad_norm": 3.0, + "grad_norm_var": 0.017838541666666666, + "learning_rate": 0.0001, + "loss": 7.9441, + "loss/crossentropy": 2.5668708086013794, + "loss/hidden": 3.1953125, + "loss/jsd": 0.0, + "loss/logits": 0.27752047777175903, + "step": 3892 + }, + { + "epoch": 0.243375, + "grad_norm": 3.015625, + "grad_norm_var": 0.013785807291666667, + "learning_rate": 0.0001, + "loss": 7.7886, + "loss/crossentropy": 2.1933363676071167, + "loss/hidden": 3.046875, + "loss/jsd": 0.0, + "loss/logits": 0.23820041120052338, + "step": 3894 + }, + { + "epoch": 0.2435, + "grad_norm": 2.828125, + "grad_norm_var": 0.010480753580729167, + "learning_rate": 0.0001, + "loss": 7.9344, + "loss/crossentropy": 2.3005311489105225, + "loss/hidden": 3.0703125, + "loss/jsd": 0.0, + "loss/logits": 0.24230662733316422, + "step": 3896 + }, + { + "epoch": 0.243625, + "grad_norm": 2.765625, + "grad_norm_var": 0.009012858072916666, + "learning_rate": 0.0001, + "loss": 7.9342, + "loss/crossentropy": 2.390279769897461, + "loss/hidden": 3.125, + "loss/jsd": 0.0, + "loss/logits": 0.24232257902622223, + "step": 3898 + }, + { + "epoch": 0.24375, + "grad_norm": 2.96875, + "grad_norm_var": 0.007624308268229167, + "learning_rate": 0.0001, + "loss": 7.9865, + "loss/crossentropy": 2.3709793090820312, + "loss/hidden": 3.125, + "loss/jsd": 0.0, + "loss/logits": 0.27256976068019867, + "step": 3900 + }, + { + "epoch": 0.243875, + "grad_norm": 2.6875, + "grad_norm_var": 0.008885701497395834, + "learning_rate": 0.0001, + "loss": 7.7007, + "loss/crossentropy": 2.0079593658447266, + "loss/hidden": 3.0234375, + "loss/jsd": 0.0, + "loss/logits": 0.223800927400589, + "step": 3902 + }, + { + "epoch": 0.244, + "grad_norm": 3.046875, + "grad_norm_var": 0.010774739583333333, + "learning_rate": 0.0001, + "loss": 8.0776, + "loss/crossentropy": 2.421340227127075, + "loss/hidden": 3.09375, + "loss/jsd": 0.0, + "loss/logits": 0.25970883667469025, + "step": 3904 + }, + { + "epoch": 0.244125, + "grad_norm": 3.09375, + "grad_norm_var": 0.011644490559895833, + "learning_rate": 0.0001, + "loss": 7.944, + "loss/crossentropy": 2.234217405319214, + "loss/hidden": 3.1171875, + "loss/jsd": 0.0, + "loss/logits": 0.24383071064949036, + "step": 3906 + }, + { + "epoch": 0.24425, + "grad_norm": 2.921875, + "grad_norm_var": 0.010749308268229167, + "learning_rate": 0.0001, + "loss": 8.063, + "loss/crossentropy": 2.5897743701934814, + "loss/hidden": 3.078125, + "loss/jsd": 0.0, + "loss/logits": 0.25560788810253143, + "step": 3908 + }, + { + "epoch": 0.244375, + "grad_norm": 3.25, + "grad_norm_var": 0.09846598307291667, + "learning_rate": 0.0001, + "loss": 7.9418, + "loss/crossentropy": 2.1307941675186157, + "loss/hidden": 3.140625, + "loss/jsd": 0.0, + "loss/logits": 0.26282523572444916, + "step": 3910 + }, + { + "epoch": 0.2445, + "grad_norm": 3.03125, + "grad_norm_var": 0.0960357666015625, + "learning_rate": 0.0001, + "loss": 8.0311, + "loss/crossentropy": 2.1970854997634888, + "loss/hidden": 3.0625, + "loss/jsd": 0.0, + "loss/logits": 0.24500936269760132, + "step": 3912 + }, + { + "epoch": 0.244625, + "grad_norm": 2.84375, + "grad_norm_var": 0.09399312337239583, + "learning_rate": 0.0001, + "loss": 8.0802, + "loss/crossentropy": 2.384564757347107, + "loss/hidden": 3.1328125, + "loss/jsd": 0.0, + "loss/logits": 0.26147788763046265, + "step": 3914 + }, + { + "epoch": 0.24475, + "grad_norm": 2.765625, + "grad_norm_var": 0.09719645182291667, + "learning_rate": 0.0001, + "loss": 7.9915, + "loss/crossentropy": 2.3258496522903442, + "loss/hidden": 3.140625, + "loss/jsd": 0.0, + "loss/logits": 0.24284511804580688, + "step": 3916 + }, + { + "epoch": 0.244875, + "grad_norm": 2.671875, + "grad_norm_var": 0.09752197265625, + "learning_rate": 0.0001, + "loss": 7.7315, + "loss/crossentropy": 2.225629687309265, + "loss/hidden": 3.140625, + "loss/jsd": 0.0, + "loss/logits": 0.25035693496465683, + "step": 3918 + }, + { + "epoch": 0.245, + "grad_norm": 2.84375, + "grad_norm_var": 0.09841206868489584, + "learning_rate": 0.0001, + "loss": 7.9674, + "loss/crossentropy": 2.0987013578414917, + "loss/hidden": 3.0546875, + "loss/jsd": 0.0, + "loss/logits": 0.23937055468559265, + "step": 3920 + }, + { + "epoch": 0.245125, + "grad_norm": 2.875, + "grad_norm_var": 0.09804280598958333, + "learning_rate": 0.0001, + "loss": 8.0213, + "loss/crossentropy": 2.294238328933716, + "loss/hidden": 3.1484375, + "loss/jsd": 0.0, + "loss/logits": 0.25349240005016327, + "step": 3922 + }, + { + "epoch": 0.24525, + "grad_norm": 2.8125, + "grad_norm_var": 0.10007222493489583, + "learning_rate": 0.0001, + "loss": 8.0705, + "loss/crossentropy": 2.1828103065490723, + "loss/hidden": 3.140625, + "loss/jsd": 0.0, + "loss/logits": 0.25784528255462646, + "step": 3924 + }, + { + "epoch": 0.245375, + "grad_norm": 3.109375, + "grad_norm_var": 0.0164703369140625, + "learning_rate": 0.0001, + "loss": 7.986, + "loss/crossentropy": 2.023577570915222, + "loss/hidden": 3.1171875, + "loss/jsd": 0.0, + "loss/logits": 0.24002478271722794, + "step": 3926 + }, + { + "epoch": 0.2455, + "grad_norm": 2.9375, + "grad_norm_var": 0.01533203125, + "learning_rate": 0.0001, + "loss": 7.9938, + "loss/crossentropy": 2.319468140602112, + "loss/hidden": 3.140625, + "loss/jsd": 0.0, + "loss/logits": 0.2421136051416397, + "step": 3928 + }, + { + "epoch": 0.245625, + "grad_norm": 2.734375, + "grad_norm_var": 0.020406087239583332, + "learning_rate": 0.0001, + "loss": 7.8347, + "loss/crossentropy": 2.2601557970046997, + "loss/hidden": 3.1875, + "loss/jsd": 0.0, + "loss/logits": 0.2663475573062897, + "step": 3930 + }, + { + "epoch": 0.24575, + "grad_norm": 3.265625, + "grad_norm_var": 0.025178019205729166, + "learning_rate": 0.0001, + "loss": 8.3085, + "loss/crossentropy": 2.735992908477783, + "loss/hidden": 3.125, + "loss/jsd": 0.0, + "loss/logits": 0.2462017834186554, + "step": 3932 + }, + { + "epoch": 0.245875, + "grad_norm": 2.90625, + "grad_norm_var": 0.019774373372395834, + "learning_rate": 0.0001, + "loss": 7.9586, + "loss/crossentropy": 2.361487627029419, + "loss/hidden": 3.1484375, + "loss/jsd": 0.0, + "loss/logits": 0.24462847411632538, + "step": 3934 + }, + { + "epoch": 0.246, + "grad_norm": 2.796875, + "grad_norm_var": 0.031050618489583334, + "learning_rate": 0.0001, + "loss": 7.8382, + "loss/crossentropy": 2.1037662029266357, + "loss/hidden": 3.125, + "loss/jsd": 0.0, + "loss/logits": 0.24985255300998688, + "step": 3936 + }, + { + "epoch": 0.246125, + "grad_norm": 2.828125, + "grad_norm_var": 0.03432515462239583, + "learning_rate": 0.0001, + "loss": 7.9081, + "loss/crossentropy": 2.383851647377014, + "loss/hidden": 3.1328125, + "loss/jsd": 0.0, + "loss/logits": 0.23125454783439636, + "step": 3938 + }, + { + "epoch": 0.24625, + "grad_norm": 3.0, + "grad_norm_var": 0.03535054524739583, + "learning_rate": 0.0001, + "loss": 8.0179, + "loss/crossentropy": 2.383346676826477, + "loss/hidden": 3.15625, + "loss/jsd": 0.0, + "loss/logits": 0.2499464601278305, + "step": 3940 + }, + { + "epoch": 0.246375, + "grad_norm": 2.796875, + "grad_norm_var": 0.0344146728515625, + "learning_rate": 0.0001, + "loss": 7.9388, + "loss/crossentropy": 2.221991539001465, + "loss/hidden": 3.0390625, + "loss/jsd": 0.0, + "loss/logits": 0.25429509580135345, + "step": 3942 + }, + { + "epoch": 0.2465, + "grad_norm": 2.8125, + "grad_norm_var": 0.035074869791666664, + "learning_rate": 0.0001, + "loss": 7.7599, + "loss/crossentropy": 2.186113476753235, + "loss/hidden": 3.1328125, + "loss/jsd": 0.0, + "loss/logits": 0.22729599475860596, + "step": 3944 + }, + { + "epoch": 0.246625, + "grad_norm": 2.890625, + "grad_norm_var": 0.030223592122395834, + "learning_rate": 0.0001, + "loss": 7.8738, + "loss/crossentropy": 2.5532344579696655, + "loss/hidden": 3.15625, + "loss/jsd": 0.0, + "loss/logits": 0.2494053915143013, + "step": 3946 + }, + { + "epoch": 0.24675, + "grad_norm": 2.953125, + "grad_norm_var": 0.02056884765625, + "learning_rate": 0.0001, + "loss": 7.9992, + "loss/crossentropy": 2.3294448852539062, + "loss/hidden": 3.0703125, + "loss/jsd": 0.0, + "loss/logits": 0.24838833510875702, + "step": 3948 + }, + { + "epoch": 0.246875, + "grad_norm": 3.296875, + "grad_norm_var": 0.02877197265625, + "learning_rate": 0.0001, + "loss": 7.8963, + "loss/crossentropy": 2.1606216430664062, + "loss/hidden": 3.140625, + "loss/jsd": 0.0, + "loss/logits": 0.25755712389945984, + "step": 3950 + }, + { + "epoch": 0.247, + "grad_norm": 2.671875, + "grad_norm_var": 0.0237457275390625, + "learning_rate": 0.0001, + "loss": 7.9671, + "loss/crossentropy": 2.2953662872314453, + "loss/hidden": 3.125, + "loss/jsd": 0.0, + "loss/logits": 0.2478877156972885, + "step": 3952 + }, + { + "epoch": 0.247125, + "grad_norm": 2.953125, + "grad_norm_var": 0.022272745768229168, + "learning_rate": 0.0001, + "loss": 8.0112, + "loss/crossentropy": 2.187902808189392, + "loss/hidden": 3.1171875, + "loss/jsd": 0.0, + "loss/logits": 0.2752064913511276, + "step": 3954 + }, + { + "epoch": 0.24725, + "grad_norm": 3.015625, + "grad_norm_var": 0.021805826822916666, + "learning_rate": 0.0001, + "loss": 7.9066, + "loss/crossentropy": 2.289653182029724, + "loss/hidden": 3.1640625, + "loss/jsd": 0.0, + "loss/logits": 0.24460161477327347, + "step": 3956 + }, + { + "epoch": 0.247375, + "grad_norm": 2.828125, + "grad_norm_var": 0.021711222330729165, + "learning_rate": 0.0001, + "loss": 7.7137, + "loss/crossentropy": 1.9959831833839417, + "loss/hidden": 3.078125, + "loss/jsd": 0.0, + "loss/logits": 0.22675126791000366, + "step": 3958 + }, + { + "epoch": 0.2475, + "grad_norm": 3.0625, + "grad_norm_var": 0.023502604166666666, + "learning_rate": 0.0001, + "loss": 8.0294, + "loss/crossentropy": 2.286497712135315, + "loss/hidden": 3.1484375, + "loss/jsd": 0.0, + "loss/logits": 0.2442842796444893, + "step": 3960 + }, + { + "epoch": 0.247625, + "grad_norm": 2.75, + "grad_norm_var": 0.024137369791666665, + "learning_rate": 0.0001, + "loss": 7.9915, + "loss/crossentropy": 2.2502633333206177, + "loss/hidden": 3.1328125, + "loss/jsd": 0.0, + "loss/logits": 0.2236596718430519, + "step": 3962 + }, + { + "epoch": 0.24775, + "grad_norm": 2.75, + "grad_norm_var": 0.025169881184895833, + "learning_rate": 0.0001, + "loss": 7.9153, + "loss/crossentropy": 2.2781342267990112, + "loss/hidden": 3.1484375, + "loss/jsd": 0.0, + "loss/logits": 0.24574154615402222, + "step": 3964 + }, + { + "epoch": 0.247875, + "grad_norm": 2.90625, + "grad_norm_var": 0.013765462239583333, + "learning_rate": 0.0001, + "loss": 7.8426, + "loss/crossentropy": 2.119445323944092, + "loss/hidden": 3.0859375, + "loss/jsd": 0.0, + "loss/logits": 0.23813114315271378, + "step": 3966 + }, + { + "epoch": 0.248, + "grad_norm": 3.03125, + "grad_norm_var": 0.013841756184895833, + "learning_rate": 0.0001, + "loss": 8.0836, + "loss/crossentropy": 2.188897430896759, + "loss/hidden": 3.0859375, + "loss/jsd": 0.0, + "loss/logits": 0.2473863735795021, + "step": 3968 + }, + { + "epoch": 0.248125, + "grad_norm": 2.828125, + "grad_norm_var": 0.0142486572265625, + "learning_rate": 0.0001, + "loss": 7.9057, + "loss/crossentropy": 2.279172897338867, + "loss/hidden": 3.140625, + "loss/jsd": 0.0, + "loss/logits": 0.26379649341106415, + "step": 3970 + }, + { + "epoch": 0.24825, + "grad_norm": 2.875, + "grad_norm_var": 0.012116495768229167, + "learning_rate": 0.0001, + "loss": 7.7416, + "loss/crossentropy": 2.3670825958251953, + "loss/hidden": 3.0078125, + "loss/jsd": 0.0, + "loss/logits": 0.22166435420513153, + "step": 3972 + }, + { + "epoch": 0.248375, + "grad_norm": 2.984375, + "grad_norm_var": 0.014484659830729166, + "learning_rate": 0.0001, + "loss": 7.9191, + "loss/crossentropy": 2.4747731685638428, + "loss/hidden": 3.15625, + "loss/jsd": 0.0, + "loss/logits": 0.24381603300571442, + "step": 3974 + }, + { + "epoch": 0.2485, + "grad_norm": 2.90625, + "grad_norm_var": 0.0111968994140625, + "learning_rate": 0.0001, + "loss": 7.9802, + "loss/crossentropy": 2.2822595834732056, + "loss/hidden": 3.078125, + "loss/jsd": 0.0, + "loss/logits": 0.24216417968273163, + "step": 3976 + }, + { + "epoch": 0.248625, + "grad_norm": 2.671875, + "grad_norm_var": 0.013016764322916667, + "learning_rate": 0.0001, + "loss": 7.855, + "loss/crossentropy": 2.4248218536376953, + "loss/hidden": 3.1015625, + "loss/jsd": 0.0, + "loss/logits": 0.2518390789628029, + "step": 3978 + }, + { + "epoch": 0.24875, + "grad_norm": 2.9375, + "grad_norm_var": 0.012300618489583333, + "learning_rate": 0.0001, + "loss": 7.9805, + "loss/crossentropy": 2.379852533340454, + "loss/hidden": 3.15625, + "loss/jsd": 0.0, + "loss/logits": 0.2360517606139183, + "step": 3980 + }, + { + "epoch": 0.248875, + "grad_norm": 2.90625, + "grad_norm_var": 0.010431925455729166, + "learning_rate": 0.0001, + "loss": 7.9083, + "loss/crossentropy": 2.3196099996566772, + "loss/hidden": 3.0546875, + "loss/jsd": 0.0, + "loss/logits": 0.23856279253959656, + "step": 3982 + }, + { + "epoch": 0.249, + "grad_norm": 2.9375, + "grad_norm_var": 0.011432902018229166, + "learning_rate": 0.0001, + "loss": 7.898, + "loss/crossentropy": 2.286087989807129, + "loss/hidden": 3.09375, + "loss/jsd": 0.0, + "loss/logits": 0.24119911342859268, + "step": 3984 + }, + { + "epoch": 0.249125, + "grad_norm": 2.953125, + "grad_norm_var": 0.010904947916666666, + "learning_rate": 0.0001, + "loss": 7.9097, + "loss/crossentropy": 2.2220507860183716, + "loss/hidden": 3.078125, + "loss/jsd": 0.0, + "loss/logits": 0.2348850816488266, + "step": 3986 + }, + { + "epoch": 0.24925, + "grad_norm": 2.875, + "grad_norm_var": 0.010933430989583333, + "learning_rate": 0.0001, + "loss": 8.1498, + "loss/crossentropy": 2.376457691192627, + "loss/hidden": 3.0546875, + "loss/jsd": 0.0, + "loss/logits": 0.2478000372648239, + "step": 3988 + }, + { + "epoch": 0.249375, + "grad_norm": 2.78125, + "grad_norm_var": 0.008552042643229167, + "learning_rate": 0.0001, + "loss": 7.9849, + "loss/crossentropy": 2.367288827896118, + "loss/hidden": 3.140625, + "loss/jsd": 0.0, + "loss/logits": 0.25519511103630066, + "step": 3990 + }, + { + "epoch": 0.2495, + "grad_norm": 3.09375, + "grad_norm_var": 0.012495930989583333, + "learning_rate": 0.0001, + "loss": 8.0479, + "loss/crossentropy": 2.317150592803955, + "loss/hidden": 3.1328125, + "loss/jsd": 0.0, + "loss/logits": 0.25810791552066803, + "step": 3992 + }, + { + "epoch": 0.249625, + "grad_norm": 2.78125, + "grad_norm_var": 0.01011962890625, + "learning_rate": 0.0001, + "loss": 7.8396, + "loss/crossentropy": 2.1635915637016296, + "loss/hidden": 3.140625, + "loss/jsd": 0.0, + "loss/logits": 0.2583113983273506, + "step": 3994 + }, + { + "epoch": 0.24975, + "grad_norm": 2.640625, + "grad_norm_var": 0.013451131184895833, + "learning_rate": 0.0001, + "loss": 7.8905, + "loss/crossentropy": 2.036882519721985, + "loss/hidden": 3.0546875, + "loss/jsd": 0.0, + "loss/logits": 0.23229141533374786, + "step": 3996 + }, + { + "epoch": 0.249875, + "grad_norm": 2.78125, + "grad_norm_var": 0.01529541015625, + "learning_rate": 0.0001, + "loss": 7.9271, + "loss/crossentropy": 2.4220268726348877, + "loss/hidden": 3.046875, + "loss/jsd": 0.0, + "loss/logits": 0.2348419427871704, + "step": 3998 + }, + { + "epoch": 0.25, + "grad_norm": 2.75, + "grad_norm_var": 0.013655598958333333, + "learning_rate": 0.0001, + "loss": 7.6323, + "loss/crossentropy": 2.0106801986694336, + "loss/hidden": 3.0390625, + "loss/jsd": 0.0, + "loss/logits": 0.2393062487244606, + "step": 4000 + } + ], + "logging_steps": 2, + "max_steps": 16000, + "num_input_tokens_seen": 0, + "num_train_epochs": 9223372036854775807, + "save_steps": 4000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 4.16590621310976e+18, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +}