{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.5, "eval_steps": 2000, "global_step": 8000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.000125, "grad_norm": 408.0, "learning_rate": 1.18e-05, "loss": 99.4106, "loss/crossentropy": 9.463456630706787, "loss/hidden": 16.5625, "loss/jsd": 0.0, "loss/logits": 7.299906253814697, "step": 2 }, { "epoch": 0.00025, "grad_norm": 394.0, "learning_rate": 1.3600000000000002e-05, "loss": 98.4945, "loss/crossentropy": 9.324356079101562, "loss/hidden": 16.5, "loss/jsd": 0.0, "loss/logits": 7.335062265396118, "step": 4 }, { "epoch": 0.000375, "grad_norm": 400.0, "learning_rate": 1.54e-05, "loss": 98.8084, "loss/crossentropy": 9.356433391571045, "loss/hidden": 16.5625, "loss/jsd": 0.0, "loss/logits": 7.372653961181641, "step": 6 }, { "epoch": 0.0005, "grad_norm": 200.0, "learning_rate": 1.72e-05, "loss": 95.0285, "loss/crossentropy": 8.909065246582031, "loss/hidden": 16.5625, "loss/jsd": 0.0, "loss/logits": 6.833947658538818, "step": 8 }, { "epoch": 0.000625, "grad_norm": 168.0, "learning_rate": 1.9e-05, "loss": 91.8288, "loss/crossentropy": 8.619627475738525, "loss/hidden": 16.5, "loss/jsd": 0.0, "loss/logits": 6.534380674362183, "step": 10 }, { "epoch": 0.00075, "grad_norm": 142.0, "learning_rate": 2.0800000000000004e-05, "loss": 87.6764, "loss/crossentropy": 8.3522367477417, "loss/hidden": 16.3125, "loss/jsd": 0.0, "loss/logits": 6.020437479019165, "step": 12 }, { "epoch": 0.000875, "grad_norm": 120.0, "learning_rate": 2.2600000000000004e-05, "loss": 86.5162, "loss/crossentropy": 8.181874752044678, "loss/hidden": 16.25, "loss/jsd": 0.0, "loss/logits": 6.170382499694824, "step": 14 }, { "epoch": 0.001, "grad_norm": 102.0, "grad_norm_var": 17309.115625, "learning_rate": 2.4400000000000004e-05, "loss": 83.2473, "loss/crossentropy": 7.804777383804321, "loss/hidden": 15.875, "loss/jsd": 0.0, "loss/logits": 5.761872053146362, "step": 16 }, { "epoch": 0.001125, "grad_norm": 98.0, "grad_norm_var": 16404.073958333334, "learning_rate": 2.6200000000000003e-05, "loss": 82.994, "loss/crossentropy": 8.070159673690796, "loss/hidden": 15.4375, "loss/jsd": 0.0, "loss/logits": 6.122724771499634, "step": 18 }, { "epoch": 0.00125, "grad_norm": 91.0, "grad_norm_var": 13484.980989583333, "learning_rate": 2.8000000000000003e-05, "loss": 75.5942, "loss/crossentropy": 7.523748397827148, "loss/hidden": 15.21875, "loss/jsd": 0.0, "loss/logits": 5.568537950515747, "step": 20 }, { "epoch": 0.001375, "grad_norm": 42.75, "grad_norm_var": 7560.057291666667, "learning_rate": 2.9800000000000006e-05, "loss": 74.5522, "loss/crossentropy": 7.2410197257995605, "loss/hidden": 15.0, "loss/jsd": 0.0, "loss/logits": 5.393213272094727, "step": 22 }, { "epoch": 0.0015, "grad_norm": 47.25, "grad_norm_var": 7167.35, "learning_rate": 3.16e-05, "loss": 70.721, "loss/crossentropy": 6.929592132568359, "loss/hidden": 15.0, "loss/jsd": 0.0, "loss/logits": 5.017954587936401, "step": 24 }, { "epoch": 0.001625, "grad_norm": 68.0, "grad_norm_var": 7036.907291666666, "learning_rate": 3.3400000000000005e-05, "loss": 67.2375, "loss/crossentropy": 6.483344078063965, "loss/hidden": 14.90625, "loss/jsd": 0.0, "loss/logits": 4.4856719970703125, "step": 26 }, { "epoch": 0.00175, "grad_norm": 85.5, "grad_norm_var": 7041.079166666666, "learning_rate": 3.520000000000001e-05, "loss": 62.0161, "loss/crossentropy": 6.2141289710998535, "loss/hidden": 13.9375, "loss/jsd": 0.0, "loss/logits": 3.9922406673431396, "step": 28 }, { "epoch": 0.001875, "grad_norm": 55.0, "grad_norm_var": 7087.145833333333, "learning_rate": 3.7e-05, "loss": 58.0482, "loss/crossentropy": 5.753799676895142, "loss/hidden": 13.625, "loss/jsd": 0.0, "loss/logits": 3.7023929357528687, "step": 30 }, { "epoch": 0.002, "grad_norm": 61.75, "grad_norm_var": 7260.804166666667, "learning_rate": 3.88e-05, "loss": 54.3741, "loss/crossentropy": 5.369441032409668, "loss/hidden": 13.40625, "loss/jsd": 0.0, "loss/logits": 3.5642011165618896, "step": 32 }, { "epoch": 0.002125, "grad_norm": 64.5, "grad_norm_var": 891.0416666666666, "learning_rate": 4.0600000000000004e-05, "loss": 49.2896, "loss/crossentropy": 5.303982257843018, "loss/hidden": 13.0, "loss/jsd": 0.0, "loss/logits": 3.1789742708206177, "step": 34 }, { "epoch": 0.00225, "grad_norm": 59.5, "grad_norm_var": 854.5059895833333, "learning_rate": 4.240000000000001e-05, "loss": 45.052, "loss/crossentropy": 4.87110447883606, "loss/hidden": 12.40625, "loss/jsd": 0.0, "loss/logits": 2.8510398864746094, "step": 36 }, { "epoch": 0.002375, "grad_norm": 55.0, "grad_norm_var": 841.62890625, "learning_rate": 4.420000000000001e-05, "loss": 41.4091, "loss/crossentropy": 4.75779914855957, "loss/hidden": 11.84375, "loss/jsd": 0.0, "loss/logits": 2.4686522483825684, "step": 38 }, { "epoch": 0.0025, "grad_norm": 43.75, "grad_norm_var": 855.7958333333333, "learning_rate": 4.600000000000001e-05, "loss": 38.7329, "loss/crossentropy": 4.4945924282073975, "loss/hidden": 11.4375, "loss/jsd": 0.0, "loss/logits": 2.3037325143814087, "step": 40 }, { "epoch": 0.002625, "grad_norm": 45.25, "grad_norm_var": 894.5875, "learning_rate": 4.78e-05, "loss": 36.1262, "loss/crossentropy": 4.092380046844482, "loss/hidden": 11.03125, "loss/jsd": 0.0, "loss/logits": 2.2024214267730713, "step": 42 }, { "epoch": 0.00275, "grad_norm": 30.875, "grad_norm_var": 165.4806640625, "learning_rate": 4.96e-05, "loss": 34.0356, "loss/crossentropy": 3.9454950094223022, "loss/hidden": 10.5, "loss/jsd": 0.0, "loss/logits": 1.9096736907958984, "step": 44 }, { "epoch": 0.002875, "grad_norm": 31.5, "grad_norm_var": 190.79368489583334, "learning_rate": 5.14e-05, "loss": 31.1559, "loss/crossentropy": 3.7383527755737305, "loss/hidden": 9.96875, "loss/jsd": 0.0, "loss/logits": 1.6771507859230042, "step": 46 }, { "epoch": 0.003, "grad_norm": 28.125, "grad_norm_var": 214.92604166666666, "learning_rate": 5.3200000000000006e-05, "loss": 30.2863, "loss/crossentropy": 3.717210292816162, "loss/hidden": 9.90625, "loss/jsd": 0.0, "loss/logits": 1.7804301381111145, "step": 48 }, { "epoch": 0.003125, "grad_norm": 26.375, "grad_norm_var": 156.2681640625, "learning_rate": 5.500000000000001e-05, "loss": 28.8697, "loss/crossentropy": 3.8603330850601196, "loss/hidden": 9.40625, "loss/jsd": 0.0, "loss/logits": 1.5492290258407593, "step": 50 }, { "epoch": 0.00325, "grad_norm": 23.125, "grad_norm_var": 97.6994140625, "learning_rate": 5.680000000000001e-05, "loss": 27.3189, "loss/crossentropy": 3.6423943042755127, "loss/hidden": 9.28125, "loss/jsd": 0.0, "loss/logits": 1.4414083361625671, "step": 52 }, { "epoch": 0.003375, "grad_norm": 31.75, "grad_norm_var": 54.634375, "learning_rate": 5.860000000000001e-05, "loss": 26.6009, "loss/crossentropy": 3.383611798286438, "loss/hidden": 9.0, "loss/jsd": 0.0, "loss/logits": 1.3774727582931519, "step": 54 }, { "epoch": 0.0035, "grad_norm": 17.5, "grad_norm_var": 56.4431640625, "learning_rate": 6.040000000000001e-05, "loss": 25.9301, "loss/crossentropy": 3.3406291007995605, "loss/hidden": 8.90625, "loss/jsd": 0.0, "loss/logits": 1.3027867674827576, "step": 56 }, { "epoch": 0.003625, "grad_norm": 19.625, "grad_norm_var": 805.69140625, "learning_rate": 6.220000000000001e-05, "loss": 24.8141, "loss/crossentropy": 3.271217703819275, "loss/hidden": 8.71875, "loss/jsd": 0.0, "loss/logits": 1.3078763484954834, "step": 58 }, { "epoch": 0.00375, "grad_norm": 19.875, "grad_norm_var": 835.2473307291667, "learning_rate": 6.400000000000001e-05, "loss": 24.7392, "loss/crossentropy": 3.1293649673461914, "loss/hidden": 8.4375, "loss/jsd": 0.0, "loss/logits": 1.280139446258545, "step": 60 }, { "epoch": 0.003875, "grad_norm": 19.25, "grad_norm_var": 852.6686848958333, "learning_rate": 6.58e-05, "loss": 22.9728, "loss/crossentropy": 3.0743978023529053, "loss/hidden": 8.109375, "loss/jsd": 0.0, "loss/logits": 1.1214115023612976, "step": 62 }, { "epoch": 0.004, "grad_norm": 22.625, "grad_norm_var": 859.1119140625, "learning_rate": 6.76e-05, "loss": 23.0224, "loss/crossentropy": 3.3769785165786743, "loss/hidden": 8.25, "loss/jsd": 0.0, "loss/logits": 1.1677427291870117, "step": 64 }, { "epoch": 0.004125, "grad_norm": 18.25, "grad_norm_var": 869.5059895833333, "learning_rate": 6.94e-05, "loss": 22.2622, "loss/crossentropy": 3.1320669651031494, "loss/hidden": 7.8125, "loss/jsd": 0.0, "loss/logits": 1.08091539144516, "step": 66 }, { "epoch": 0.00425, "grad_norm": 20.0, "grad_norm_var": 884.07265625, "learning_rate": 7.120000000000001e-05, "loss": 21.2449, "loss/crossentropy": 3.257786750793457, "loss/hidden": 7.65625, "loss/jsd": 0.0, "loss/logits": 1.028902530670166, "step": 68 }, { "epoch": 0.004375, "grad_norm": 13.6875, "grad_norm_var": 899.974853515625, "learning_rate": 7.3e-05, "loss": 20.7807, "loss/crossentropy": 2.906672239303589, "loss/hidden": 7.609375, "loss/jsd": 0.0, "loss/logits": 0.9820225834846497, "step": 70 }, { "epoch": 0.0045, "grad_norm": 19.5, "grad_norm_var": 901.5062337239583, "learning_rate": 7.48e-05, "loss": 20.4968, "loss/crossentropy": 3.15093994140625, "loss/hidden": 7.578125, "loss/jsd": 0.0, "loss/logits": 1.0488424897193909, "step": 72 }, { "epoch": 0.004625, "grad_norm": 13.75, "grad_norm_var": 11.296207682291667, "learning_rate": 7.66e-05, "loss": 19.8961, "loss/crossentropy": 3.124137759208679, "loss/hidden": 7.34375, "loss/jsd": 0.0, "loss/logits": 0.9181300401687622, "step": 74 }, { "epoch": 0.00475, "grad_norm": 21.375, "grad_norm_var": 11.601676432291667, "learning_rate": 7.840000000000001e-05, "loss": 19.5071, "loss/crossentropy": 3.0490193367004395, "loss/hidden": 7.34375, "loss/jsd": 0.0, "loss/logits": 0.9544317126274109, "step": 76 }, { "epoch": 0.004875, "grad_norm": 16.625, "grad_norm_var": 11.332405598958333, "learning_rate": 8.020000000000001e-05, "loss": 19.1397, "loss/crossentropy": 2.9865410327911377, "loss/hidden": 7.234375, "loss/jsd": 0.0, "loss/logits": 0.9217109084129333, "step": 78 }, { "epoch": 0.005, "grad_norm": 13.375, "grad_norm_var": 10.337955729166667, "learning_rate": 8.200000000000001e-05, "loss": 18.751, "loss/crossentropy": 2.766746163368225, "loss/hidden": 7.109375, "loss/jsd": 0.0, "loss/logits": 0.8647720515727997, "step": 80 }, { "epoch": 0.005125, "grad_norm": 16.25, "grad_norm_var": 11.413802083333334, "learning_rate": 8.38e-05, "loss": 18.6336, "loss/crossentropy": 3.0917201042175293, "loss/hidden": 6.875, "loss/jsd": 0.0, "loss/logits": 0.8534270823001862, "step": 82 }, { "epoch": 0.00525, "grad_norm": 14.5, "grad_norm_var": 11.0328125, "learning_rate": 8.560000000000001e-05, "loss": 18.3569, "loss/crossentropy": 2.9698469638824463, "loss/hidden": 7.03125, "loss/jsd": 0.0, "loss/logits": 0.8276518881320953, "step": 84 }, { "epoch": 0.005375, "grad_norm": 11.625, "grad_norm_var": 11.439567057291667, "learning_rate": 8.740000000000001e-05, "loss": 18.0427, "loss/crossentropy": 2.968225121498108, "loss/hidden": 6.8125, "loss/jsd": 0.0, "loss/logits": 0.8551926612854004, "step": 86 }, { "epoch": 0.0055, "grad_norm": 15.875, "grad_norm_var": 11.452978515625, "learning_rate": 8.92e-05, "loss": 17.7372, "loss/crossentropy": 2.954566717147827, "loss/hidden": 6.65625, "loss/jsd": 0.0, "loss/logits": 0.8068905174732208, "step": 88 }, { "epoch": 0.005625, "grad_norm": 13.75, "grad_norm_var": 11.686572265625, "learning_rate": 9.1e-05, "loss": 17.6758, "loss/crossentropy": 3.231398582458496, "loss/hidden": 6.5625, "loss/jsd": 0.0, "loss/logits": 0.8298341929912567, "step": 90 }, { "epoch": 0.00575, "grad_norm": 11.0625, "grad_norm_var": 11.1984375, "learning_rate": 9.28e-05, "loss": 17.011, "loss/crossentropy": 2.629736542701721, "loss/hidden": 6.5, "loss/jsd": 0.0, "loss/logits": 0.7391310036182404, "step": 92 }, { "epoch": 0.005875, "grad_norm": 14.8125, "grad_norm_var": 10.108186848958333, "learning_rate": 9.46e-05, "loss": 17.0497, "loss/crossentropy": 2.803489089012146, "loss/hidden": 6.5625, "loss/jsd": 0.0, "loss/logits": 0.8144381940364838, "step": 94 }, { "epoch": 0.006, "grad_norm": 12.5625, "grad_norm_var": 10.022916666666667, "learning_rate": 9.64e-05, "loss": 16.5939, "loss/crossentropy": 3.019722819328308, "loss/hidden": 6.46875, "loss/jsd": 0.0, "loss/logits": 0.8095101118087769, "step": 96 }, { "epoch": 0.006125, "grad_norm": 15.5, "grad_norm_var": 4.770686848958333, "learning_rate": 9.82e-05, "loss": 16.5934, "loss/crossentropy": 2.80954647064209, "loss/hidden": 6.421875, "loss/jsd": 0.0, "loss/logits": 0.7483513355255127, "step": 98 }, { "epoch": 0.00625, "grad_norm": 13.5625, "grad_norm_var": 6.648893229166666, "learning_rate": 0.0001, "loss": 16.4383, "loss/crossentropy": 2.4182586669921875, "loss/hidden": 6.21875, "loss/jsd": 0.0, "loss/logits": 0.6813479959964752, "step": 100 }, { "epoch": 0.006375, "grad_norm": 15.6875, "grad_norm_var": 6.005322265625, "learning_rate": 0.0001, "loss": 16.3773, "loss/crossentropy": 2.767289161682129, "loss/hidden": 6.40625, "loss/jsd": 0.0, "loss/logits": 0.7437765300273895, "step": 102 }, { "epoch": 0.0065, "grad_norm": 13.9375, "grad_norm_var": 6.213004557291667, "learning_rate": 0.0001, "loss": 16.6123, "loss/crossentropy": 2.8596930503845215, "loss/hidden": 6.328125, "loss/jsd": 0.0, "loss/logits": 0.7475454211235046, "step": 104 }, { "epoch": 0.006625, "grad_norm": 11.1875, "grad_norm_var": 6.547900390625, "learning_rate": 0.0001, "loss": 16.4102, "loss/crossentropy": 3.031221389770508, "loss/hidden": 6.296875, "loss/jsd": 0.0, "loss/logits": 0.8019546568393707, "step": 106 }, { "epoch": 0.00675, "grad_norm": 12.8125, "grad_norm_var": 6.1900390625, "learning_rate": 0.0001, "loss": 15.9361, "loss/crossentropy": 2.9293311834335327, "loss/hidden": 6.125, "loss/jsd": 0.0, "loss/logits": 0.7174897193908691, "step": 108 }, { "epoch": 0.006875, "grad_norm": 13.5625, "grad_norm_var": 6.680452473958334, "learning_rate": 0.0001, "loss": 15.6893, "loss/crossentropy": 3.064392924308777, "loss/hidden": 6.03125, "loss/jsd": 0.0, "loss/logits": 0.7035545110702515, "step": 110 }, { "epoch": 0.007, "grad_norm": 11.125, "grad_norm_var": 6.978059895833334, "learning_rate": 0.0001, "loss": 15.9123, "loss/crossentropy": 2.668829083442688, "loss/hidden": 6.1875, "loss/jsd": 0.0, "loss/logits": 0.6724480390548706, "step": 112 }, { "epoch": 0.007125, "grad_norm": 14.1875, "grad_norm_var": 6.252083333333333, "learning_rate": 0.0001, "loss": 15.4118, "loss/crossentropy": 2.6381983757019043, "loss/hidden": 6.140625, "loss/jsd": 0.0, "loss/logits": 0.6702737212181091, "step": 114 }, { "epoch": 0.00725, "grad_norm": 10.0625, "grad_norm_var": 3.822379557291667, "learning_rate": 0.0001, "loss": 14.9318, "loss/crossentropy": 2.583546996116638, "loss/hidden": 5.96875, "loss/jsd": 0.0, "loss/logits": 0.65639927983284, "step": 116 }, { "epoch": 0.007375, "grad_norm": 10.3125, "grad_norm_var": 2.2548014322916665, "learning_rate": 0.0001, "loss": 15.1022, "loss/crossentropy": 2.70282781124115, "loss/hidden": 5.890625, "loss/jsd": 0.0, "loss/logits": 0.6296679973602295, "step": 118 }, { "epoch": 0.0075, "grad_norm": 10.8125, "grad_norm_var": 2.323681640625, "learning_rate": 0.0001, "loss": 14.9772, "loss/crossentropy": 2.714564800262451, "loss/hidden": 5.84375, "loss/jsd": 0.0, "loss/logits": 0.631663054227829, "step": 120 }, { "epoch": 0.007625, "grad_norm": 9.75, "grad_norm_var": 2.398811848958333, "learning_rate": 0.0001, "loss": 15.0945, "loss/crossentropy": 2.716395616531372, "loss/hidden": 5.84375, "loss/jsd": 0.0, "loss/logits": 0.648982048034668, "step": 122 }, { "epoch": 0.00775, "grad_norm": 13.3125, "grad_norm_var": 2.604150390625, "learning_rate": 0.0001, "loss": 15.3371, "loss/crossentropy": 2.7478760480880737, "loss/hidden": 5.890625, "loss/jsd": 0.0, "loss/logits": 0.6630505919456482, "step": 124 }, { "epoch": 0.007875, "grad_norm": 10.5625, "grad_norm_var": 2.456884765625, "learning_rate": 0.0001, "loss": 15.198, "loss/crossentropy": 2.681770443916321, "loss/hidden": 5.71875, "loss/jsd": 0.0, "loss/logits": 0.631014883518219, "step": 126 }, { "epoch": 0.008, "grad_norm": 9.8125, "grad_norm_var": 2.166650390625, "learning_rate": 0.0001, "loss": 14.9414, "loss/crossentropy": 2.5563963651657104, "loss/hidden": 5.796875, "loss/jsd": 0.0, "loss/logits": 0.6471997797489166, "step": 128 }, { "epoch": 0.008125, "grad_norm": 12.25, "grad_norm_var": 1.3471354166666667, "learning_rate": 0.0001, "loss": 14.2076, "loss/crossentropy": 2.4711976051330566, "loss/hidden": 5.59375, "loss/jsd": 0.0, "loss/logits": 0.6036389470100403, "step": 130 }, { "epoch": 0.00825, "grad_norm": 8.0625, "grad_norm_var": 1.8124348958333334, "learning_rate": 0.0001, "loss": 14.3162, "loss/crossentropy": 2.717278480529785, "loss/hidden": 5.71875, "loss/jsd": 0.0, "loss/logits": 0.5794410407543182, "step": 132 }, { "epoch": 0.008375, "grad_norm": 9.3125, "grad_norm_var": 1.694384765625, "learning_rate": 0.0001, "loss": 14.3277, "loss/crossentropy": 2.708119750022888, "loss/hidden": 5.609375, "loss/jsd": 0.0, "loss/logits": 0.6186929643154144, "step": 134 }, { "epoch": 0.0085, "grad_norm": 9.4375, "grad_norm_var": 1.8212890625, "learning_rate": 0.0001, "loss": 14.342, "loss/crossentropy": 2.626417875289917, "loss/hidden": 5.609375, "loss/jsd": 0.0, "loss/logits": 0.6479573547840118, "step": 136 }, { "epoch": 0.008625, "grad_norm": 9.375, "grad_norm_var": 1.9150390625, "learning_rate": 0.0001, "loss": 14.3908, "loss/crossentropy": 2.669326066970825, "loss/hidden": 5.59375, "loss/jsd": 0.0, "loss/logits": 0.5919320583343506, "step": 138 }, { "epoch": 0.00875, "grad_norm": 9.875, "grad_norm_var": 1.1773274739583333, "learning_rate": 0.0001, "loss": 14.3644, "loss/crossentropy": 2.523828148841858, "loss/hidden": 5.5625, "loss/jsd": 0.0, "loss/logits": 0.5997762680053711, "step": 140 }, { "epoch": 0.008875, "grad_norm": 10.3125, "grad_norm_var": 1.1572265625, "learning_rate": 0.0001, "loss": 13.9086, "loss/crossentropy": 2.4612866640090942, "loss/hidden": 5.5625, "loss/jsd": 0.0, "loss/logits": 0.5790427327156067, "step": 142 }, { "epoch": 0.009, "grad_norm": 9.3125, "grad_norm_var": 1.0247395833333333, "learning_rate": 0.0001, "loss": 14.3827, "loss/crossentropy": 2.3406189680099487, "loss/hidden": 5.53125, "loss/jsd": 0.0, "loss/logits": 0.5852385461330414, "step": 144 }, { "epoch": 0.009125, "grad_norm": 8.0625, "grad_norm_var": 0.7343098958333333, "learning_rate": 0.0001, "loss": 14.0124, "loss/crossentropy": 2.3046847581863403, "loss/hidden": 5.453125, "loss/jsd": 0.0, "loss/logits": 0.5159327685832977, "step": 146 }, { "epoch": 0.00925, "grad_norm": 14.1875, "grad_norm_var": 1.8917805989583334, "learning_rate": 0.0001, "loss": 14.2854, "loss/crossentropy": 2.673864483833313, "loss/hidden": 5.421875, "loss/jsd": 0.0, "loss/logits": 0.568247377872467, "step": 148 }, { "epoch": 0.009375, "grad_norm": 7.59375, "grad_norm_var": 2.16685791015625, "learning_rate": 0.0001, "loss": 13.7424, "loss/crossentropy": 2.4085657596588135, "loss/hidden": 5.328125, "loss/jsd": 0.0, "loss/logits": 0.576050728559494, "step": 150 }, { "epoch": 0.0095, "grad_norm": 9.625, "grad_norm_var": 2.378759765625, "learning_rate": 0.0001, "loss": 13.8437, "loss/crossentropy": 2.6601611375808716, "loss/hidden": 5.40625, "loss/jsd": 0.0, "loss/logits": 0.5511062890291214, "step": 152 }, { "epoch": 0.009625, "grad_norm": 8.375, "grad_norm_var": 2.7024576822916666, "learning_rate": 0.0001, "loss": 13.9772, "loss/crossentropy": 2.6024333238601685, "loss/hidden": 5.390625, "loss/jsd": 0.0, "loss/logits": 0.6262157559394836, "step": 154 }, { "epoch": 0.00975, "grad_norm": 7.875, "grad_norm_var": 3.062744140625, "learning_rate": 0.0001, "loss": 13.9552, "loss/crossentropy": 2.8262619972229004, "loss/hidden": 5.40625, "loss/jsd": 0.0, "loss/logits": 0.5858268141746521, "step": 156 }, { "epoch": 0.009875, "grad_norm": 7.46875, "grad_norm_var": 3.2928670247395835, "learning_rate": 0.0001, "loss": 13.5471, "loss/crossentropy": 2.4514461755752563, "loss/hidden": 5.4375, "loss/jsd": 0.0, "loss/logits": 0.5691545605659485, "step": 158 }, { "epoch": 0.01, "grad_norm": 10.125, "grad_norm_var": 3.296317545572917, "learning_rate": 0.0001, "loss": 13.7877, "loss/crossentropy": 2.5704935789108276, "loss/hidden": 5.390625, "loss/jsd": 0.0, "loss/logits": 0.5797623991966248, "step": 160 }, { "epoch": 0.010125, "grad_norm": 8.9375, "grad_norm_var": 3.193648274739583, "learning_rate": 0.0001, "loss": 13.5383, "loss/crossentropy": 2.5421087741851807, "loss/hidden": 5.296875, "loss/jsd": 0.0, "loss/logits": 0.5653413534164429, "step": 162 }, { "epoch": 0.01025, "grad_norm": 9.375, "grad_norm_var": 1.48746337890625, "learning_rate": 0.0001, "loss": 13.5876, "loss/crossentropy": 2.5322933197021484, "loss/hidden": 5.34375, "loss/jsd": 0.0, "loss/logits": 0.5471592545509338, "step": 164 }, { "epoch": 0.010375, "grad_norm": 7.71875, "grad_norm_var": 1.2683878580729167, "learning_rate": 0.0001, "loss": 13.3671, "loss/crossentropy": 2.5972307920455933, "loss/hidden": 5.25, "loss/jsd": 0.0, "loss/logits": 0.5242535173892975, "step": 166 }, { "epoch": 0.0105, "grad_norm": 7.71875, "grad_norm_var": 1.3320149739583333, "learning_rate": 0.0001, "loss": 13.727, "loss/crossentropy": 2.654882788658142, "loss/hidden": 5.25, "loss/jsd": 0.0, "loss/logits": 0.5885663330554962, "step": 168 }, { "epoch": 0.010625, "grad_norm": 8.6875, "grad_norm_var": 0.7624348958333333, "learning_rate": 0.0001, "loss": 13.6388, "loss/crossentropy": 2.577346444129944, "loss/hidden": 5.25, "loss/jsd": 0.0, "loss/logits": 0.5597317218780518, "step": 170 }, { "epoch": 0.01075, "grad_norm": 7.78125, "grad_norm_var": 0.7722615559895833, "learning_rate": 0.0001, "loss": 13.3253, "loss/crossentropy": 2.513775587081909, "loss/hidden": 5.25, "loss/jsd": 0.0, "loss/logits": 0.5256665050983429, "step": 172 }, { "epoch": 0.010875, "grad_norm": 7.75, "grad_norm_var": 0.7695149739583333, "learning_rate": 0.0001, "loss": 13.1859, "loss/crossentropy": 2.507006049156189, "loss/hidden": 5.171875, "loss/jsd": 0.0, "loss/logits": 0.5298479795455933, "step": 174 }, { "epoch": 0.011, "grad_norm": 10.5, "grad_norm_var": 0.8642578125, "learning_rate": 0.0001, "loss": 13.3871, "loss/crossentropy": 2.546314835548401, "loss/hidden": 5.203125, "loss/jsd": 0.0, "loss/logits": 0.5244591236114502, "step": 176 }, { "epoch": 0.011125, "grad_norm": 7.71875, "grad_norm_var": 1.03707275390625, "learning_rate": 0.0001, "loss": 13.6056, "loss/crossentropy": 2.5370965003967285, "loss/hidden": 5.15625, "loss/jsd": 0.0, "loss/logits": 0.5421458780765533, "step": 178 }, { "epoch": 0.01125, "grad_norm": 7.8125, "grad_norm_var": 1.2237589518229166, "learning_rate": 0.0001, "loss": 13.6754, "loss/crossentropy": 2.702122926712036, "loss/hidden": 5.265625, "loss/jsd": 0.0, "loss/logits": 0.5821575820446014, "step": 180 }, { "epoch": 0.011375, "grad_norm": 10.1875, "grad_norm_var": 1.307666015625, "learning_rate": 0.0001, "loss": 13.4621, "loss/crossentropy": 2.6636409759521484, "loss/hidden": 5.1875, "loss/jsd": 0.0, "loss/logits": 0.5116533488035202, "step": 182 }, { "epoch": 0.0115, "grad_norm": 7.375, "grad_norm_var": 1.208984375, "learning_rate": 0.0001, "loss": 13.4015, "loss/crossentropy": 2.608703851699829, "loss/hidden": 5.203125, "loss/jsd": 0.0, "loss/logits": 0.5401088297367096, "step": 184 }, { "epoch": 0.011625, "grad_norm": 8.5625, "grad_norm_var": 1.2628214518229166, "learning_rate": 0.0001, "loss": 13.3483, "loss/crossentropy": 2.604197859764099, "loss/hidden": 5.296875, "loss/jsd": 0.0, "loss/logits": 0.5501621663570404, "step": 186 }, { "epoch": 0.01175, "grad_norm": 8.0625, "grad_norm_var": 1.2360514322916667, "learning_rate": 0.0001, "loss": 13.2514, "loss/crossentropy": 2.384081482887268, "loss/hidden": 5.109375, "loss/jsd": 0.0, "loss/logits": 0.5311485826969147, "step": 188 }, { "epoch": 0.011875, "grad_norm": 7.5, "grad_norm_var": 1.4585896809895833, "learning_rate": 0.0001, "loss": 12.9879, "loss/crossentropy": 2.501517176628113, "loss/hidden": 5.0625, "loss/jsd": 0.0, "loss/logits": 0.5271838307380676, "step": 190 }, { "epoch": 0.012, "grad_norm": 9.125, "grad_norm_var": 1.30015869140625, "learning_rate": 0.0001, "loss": 13.1282, "loss/crossentropy": 2.4795455932617188, "loss/hidden": 5.03125, "loss/jsd": 0.0, "loss/logits": 0.5566309094429016, "step": 192 }, { "epoch": 0.012125, "grad_norm": 7.5625, "grad_norm_var": 1.1060506184895833, "learning_rate": 0.0001, "loss": 13.0203, "loss/crossentropy": 2.615292191505432, "loss/hidden": 5.0625, "loss/jsd": 0.0, "loss/logits": 0.5351596176624298, "step": 194 }, { "epoch": 0.01225, "grad_norm": 8.4375, "grad_norm_var": 0.74791259765625, "learning_rate": 0.0001, "loss": 13.2183, "loss/crossentropy": 2.777504324913025, "loss/hidden": 5.0625, "loss/jsd": 0.0, "loss/logits": 0.5449462532997131, "step": 196 }, { "epoch": 0.012375, "grad_norm": 8.25, "grad_norm_var": 0.42310791015625, "learning_rate": 0.0001, "loss": 13.2077, "loss/crossentropy": 2.8624597787857056, "loss/hidden": 5.109375, "loss/jsd": 0.0, "loss/logits": 0.546259343624115, "step": 198 }, { "epoch": 0.0125, "grad_norm": 8.1875, "grad_norm_var": 0.5727498372395833, "learning_rate": 0.0001, "loss": 13.1209, "loss/crossentropy": 2.6138558387756348, "loss/hidden": 5.046875, "loss/jsd": 0.0, "loss/logits": 0.5038184821605682, "step": 200 }, { "epoch": 0.012625, "grad_norm": 7.625, "grad_norm_var": 0.5636678059895833, "learning_rate": 0.0001, "loss": 12.7479, "loss/crossentropy": 2.6792138814926147, "loss/hidden": 5.046875, "loss/jsd": 0.0, "loss/logits": 0.5708663761615753, "step": 202 }, { "epoch": 0.01275, "grad_norm": 10.1875, "grad_norm_var": 0.7628743489583333, "learning_rate": 0.0001, "loss": 12.7616, "loss/crossentropy": 2.436267137527466, "loss/hidden": 5.015625, "loss/jsd": 0.0, "loss/logits": 0.56131911277771, "step": 204 }, { "epoch": 0.012875, "grad_norm": 8.3125, "grad_norm_var": 1.3605428059895834, "learning_rate": 0.0001, "loss": 12.5898, "loss/crossentropy": 2.3147178888320923, "loss/hidden": 5.0, "loss/jsd": 0.0, "loss/logits": 0.48664502799510956, "step": 206 }, { "epoch": 0.013, "grad_norm": 6.3125, "grad_norm_var": 1.7156209309895833, "learning_rate": 0.0001, "loss": 12.1952, "loss/crossentropy": 2.36074697971344, "loss/hidden": 5.03125, "loss/jsd": 0.0, "loss/logits": 0.4715626835823059, "step": 208 }, { "epoch": 0.013125, "grad_norm": 7.65625, "grad_norm_var": 1.71051025390625, "learning_rate": 0.0001, "loss": 12.6705, "loss/crossentropy": 2.585902214050293, "loss/hidden": 5.03125, "loss/jsd": 0.0, "loss/logits": 0.5059193968772888, "step": 210 }, { "epoch": 0.01325, "grad_norm": 7.5625, "grad_norm_var": 1.7617146809895834, "learning_rate": 0.0001, "loss": 12.6794, "loss/crossentropy": 2.485123038291931, "loss/hidden": 4.90625, "loss/jsd": 0.0, "loss/logits": 0.5168211758136749, "step": 212 }, { "epoch": 0.013375, "grad_norm": 6.28125, "grad_norm_var": 1.9130167643229166, "learning_rate": 0.0001, "loss": 12.3766, "loss/crossentropy": 2.730853319168091, "loss/hidden": 4.96875, "loss/jsd": 0.0, "loss/logits": 0.5259307771921158, "step": 214 }, { "epoch": 0.0135, "grad_norm": 7.5, "grad_norm_var": 1.82261962890625, "learning_rate": 0.0001, "loss": 12.2796, "loss/crossentropy": 2.5561338663101196, "loss/hidden": 4.875, "loss/jsd": 0.0, "loss/logits": 0.5048320591449738, "step": 216 }, { "epoch": 0.013625, "grad_norm": 7.03125, "grad_norm_var": 1.89547119140625, "learning_rate": 0.0001, "loss": 12.4252, "loss/crossentropy": 2.579685091972351, "loss/hidden": 4.953125, "loss/jsd": 0.0, "loss/logits": 0.4986896812915802, "step": 218 }, { "epoch": 0.01375, "grad_norm": 6.875, "grad_norm_var": 1.7075520833333333, "learning_rate": 0.0001, "loss": 12.2975, "loss/crossentropy": 2.646655321121216, "loss/hidden": 4.890625, "loss/jsd": 0.0, "loss/logits": 0.488240122795105, "step": 220 }, { "epoch": 0.013875, "grad_norm": 7.15625, "grad_norm_var": 0.49394124348958335, "learning_rate": 0.0001, "loss": 12.4559, "loss/crossentropy": 2.576615571975708, "loss/hidden": 4.921875, "loss/jsd": 0.0, "loss/logits": 0.4858951270580292, "step": 222 }, { "epoch": 0.014, "grad_norm": 7.28125, "grad_norm_var": 0.4115193684895833, "learning_rate": 0.0001, "loss": 12.0337, "loss/crossentropy": 2.532575011253357, "loss/hidden": 4.78125, "loss/jsd": 0.0, "loss/logits": 0.5103432983160019, "step": 224 }, { "epoch": 0.014125, "grad_norm": 6.375, "grad_norm_var": 0.3961588541666667, "learning_rate": 0.0001, "loss": 12.3373, "loss/crossentropy": 2.6340404748916626, "loss/hidden": 4.890625, "loss/jsd": 0.0, "loss/logits": 0.4943613111972809, "step": 226 }, { "epoch": 0.01425, "grad_norm": 10.1875, "grad_norm_var": 1.01871337890625, "learning_rate": 0.0001, "loss": 12.8549, "loss/crossentropy": 2.454265832901001, "loss/hidden": 4.703125, "loss/jsd": 0.0, "loss/logits": 0.45377302169799805, "step": 228 }, { "epoch": 0.014375, "grad_norm": 7.65625, "grad_norm_var": 0.9704427083333333, "learning_rate": 0.0001, "loss": 12.2599, "loss/crossentropy": 2.6582183837890625, "loss/hidden": 4.921875, "loss/jsd": 0.0, "loss/logits": 0.5020380616188049, "step": 230 }, { "epoch": 0.0145, "grad_norm": 7.5, "grad_norm_var": 1.1243326822916666, "learning_rate": 0.0001, "loss": 12.6468, "loss/crossentropy": 2.657613158226013, "loss/hidden": 4.90625, "loss/jsd": 0.0, "loss/logits": 0.5409696698188782, "step": 232 }, { "epoch": 0.014625, "grad_norm": 7.0625, "grad_norm_var": 1.0277628580729166, "learning_rate": 0.0001, "loss": 12.4568, "loss/crossentropy": 2.5104974508285522, "loss/hidden": 4.859375, "loss/jsd": 0.0, "loss/logits": 0.4923330545425415, "step": 234 }, { "epoch": 0.01475, "grad_norm": 7.59375, "grad_norm_var": 0.8995930989583333, "learning_rate": 0.0001, "loss": 11.9849, "loss/crossentropy": 2.52110493183136, "loss/hidden": 4.828125, "loss/jsd": 0.0, "loss/logits": 0.4490865021944046, "step": 236 }, { "epoch": 0.014875, "grad_norm": 6.84375, "grad_norm_var": 1.0208984375, "learning_rate": 0.0001, "loss": 12.546, "loss/crossentropy": 2.8875030279159546, "loss/hidden": 4.890625, "loss/jsd": 0.0, "loss/logits": 0.5406565517187119, "step": 238 }, { "epoch": 0.015, "grad_norm": 6.0, "grad_norm_var": 1.1588175455729166, "learning_rate": 0.0001, "loss": 12.2478, "loss/crossentropy": 2.5239052772521973, "loss/hidden": 4.84375, "loss/jsd": 0.0, "loss/logits": 0.4846698194742203, "step": 240 }, { "epoch": 0.015125, "grad_norm": 6.40625, "grad_norm_var": 1.221728515625, "learning_rate": 0.0001, "loss": 12.3295, "loss/crossentropy": 2.4903002977371216, "loss/hidden": 4.78125, "loss/jsd": 0.0, "loss/logits": 0.48552611470222473, "step": 242 }, { "epoch": 0.01525, "grad_norm": 6.9375, "grad_norm_var": 0.6412760416666666, "learning_rate": 0.0001, "loss": 12.0959, "loss/crossentropy": 2.723002791404724, "loss/hidden": 4.796875, "loss/jsd": 0.0, "loss/logits": 0.4821483790874481, "step": 244 }, { "epoch": 0.015375, "grad_norm": 5.90625, "grad_norm_var": 0.7720662434895833, "learning_rate": 0.0001, "loss": 12.21, "loss/crossentropy": 2.433968424797058, "loss/hidden": 4.765625, "loss/jsd": 0.0, "loss/logits": 0.47880972921848297, "step": 246 }, { "epoch": 0.0155, "grad_norm": 8.5, "grad_norm_var": 0.6696248372395833, "learning_rate": 0.0001, "loss": 12.3585, "loss/crossentropy": 2.672105550765991, "loss/hidden": 4.859375, "loss/jsd": 0.0, "loss/logits": 0.5268009155988693, "step": 248 }, { "epoch": 0.015625, "grad_norm": 6.4375, "grad_norm_var": 0.7093587239583333, "learning_rate": 0.0001, "loss": 11.961, "loss/crossentropy": 2.404228091239929, "loss/hidden": 4.65625, "loss/jsd": 0.0, "loss/logits": 0.45947059988975525, "step": 250 }, { "epoch": 0.01575, "grad_norm": 6.5625, "grad_norm_var": 0.6781209309895834, "learning_rate": 0.0001, "loss": 11.9045, "loss/crossentropy": 2.5632474422454834, "loss/hidden": 4.78125, "loss/jsd": 0.0, "loss/logits": 0.49997828900814056, "step": 252 }, { "epoch": 0.015875, "grad_norm": 6.28125, "grad_norm_var": 0.38396809895833334, "learning_rate": 0.0001, "loss": 11.9424, "loss/crossentropy": 2.7031558752059937, "loss/hidden": 4.8125, "loss/jsd": 0.0, "loss/logits": 0.487554132938385, "step": 254 }, { "epoch": 0.016, "grad_norm": 7.15625, "grad_norm_var": 0.36288655598958336, "learning_rate": 0.0001, "loss": 12.2919, "loss/crossentropy": 2.5938860177993774, "loss/hidden": 4.765625, "loss/jsd": 0.0, "loss/logits": 0.5034405440092087, "step": 256 }, { "epoch": 0.016125, "grad_norm": 5.96875, "grad_norm_var": 0.37994384765625, "learning_rate": 0.0001, "loss": 12.0851, "loss/crossentropy": 2.503898859024048, "loss/hidden": 4.71875, "loss/jsd": 0.0, "loss/logits": 0.45617610216140747, "step": 258 }, { "epoch": 0.01625, "grad_norm": 7.25, "grad_norm_var": 0.39628499348958335, "learning_rate": 0.0001, "loss": 12.2487, "loss/crossentropy": 2.721401333808899, "loss/hidden": 4.765625, "loss/jsd": 0.0, "loss/logits": 0.5100695192813873, "step": 260 }, { "epoch": 0.016375, "grad_norm": 6.5, "grad_norm_var": 0.3338541666666667, "learning_rate": 0.0001, "loss": 11.701, "loss/crossentropy": 2.6579989194869995, "loss/hidden": 4.671875, "loss/jsd": 0.0, "loss/logits": 0.4803234338760376, "step": 262 }, { "epoch": 0.0165, "grad_norm": 6.3125, "grad_norm_var": 0.11678059895833333, "learning_rate": 0.0001, "loss": 11.7867, "loss/crossentropy": 2.5109288692474365, "loss/hidden": 4.734375, "loss/jsd": 0.0, "loss/logits": 0.4530385881662369, "step": 264 }, { "epoch": 0.016625, "grad_norm": 6.875, "grad_norm_var": 0.12444254557291666, "learning_rate": 0.0001, "loss": 12.4496, "loss/crossentropy": 2.5854159593582153, "loss/hidden": 4.640625, "loss/jsd": 0.0, "loss/logits": 0.4961777627468109, "step": 266 }, { "epoch": 0.01675, "grad_norm": 6.0625, "grad_norm_var": 0.17303059895833334, "learning_rate": 0.0001, "loss": 11.7866, "loss/crossentropy": 2.411824584007263, "loss/hidden": 4.5, "loss/jsd": 0.0, "loss/logits": 0.43849293887615204, "step": 268 }, { "epoch": 0.016875, "grad_norm": 6.875, "grad_norm_var": 0.17593994140625, "learning_rate": 0.0001, "loss": 11.6888, "loss/crossentropy": 2.568202257156372, "loss/hidden": 4.65625, "loss/jsd": 0.0, "loss/logits": 0.4507843852043152, "step": 270 }, { "epoch": 0.017, "grad_norm": 6.8125, "grad_norm_var": 0.16825764973958332, "learning_rate": 0.0001, "loss": 12.0805, "loss/crossentropy": 2.734534502029419, "loss/hidden": 4.71875, "loss/jsd": 0.0, "loss/logits": 0.47038406133651733, "step": 272 }, { "epoch": 0.017125, "grad_norm": 6.0625, "grad_norm_var": 0.18333333333333332, "learning_rate": 0.0001, "loss": 11.6923, "loss/crossentropy": 2.545106887817383, "loss/hidden": 4.65625, "loss/jsd": 0.0, "loss/logits": 0.4470268785953522, "step": 274 }, { "epoch": 0.01725, "grad_norm": 6.75, "grad_norm_var": 0.15310872395833333, "learning_rate": 0.0001, "loss": 11.8355, "loss/crossentropy": 2.5739216804504395, "loss/hidden": 4.6875, "loss/jsd": 0.0, "loss/logits": 0.5115124732255936, "step": 276 }, { "epoch": 0.017375, "grad_norm": 6.25, "grad_norm_var": 0.15276285807291667, "learning_rate": 0.0001, "loss": 12.1692, "loss/crossentropy": 2.8947519063949585, "loss/hidden": 4.71875, "loss/jsd": 0.0, "loss/logits": 0.5023118555545807, "step": 278 }, { "epoch": 0.0175, "grad_norm": 6.34375, "grad_norm_var": 0.16066080729166668, "learning_rate": 0.0001, "loss": 11.3566, "loss/crossentropy": 2.4826793670654297, "loss/hidden": 4.5625, "loss/jsd": 0.0, "loss/logits": 0.45133158564567566, "step": 280 }, { "epoch": 0.017625, "grad_norm": 6.03125, "grad_norm_var": 0.142041015625, "learning_rate": 0.0001, "loss": 11.841, "loss/crossentropy": 2.4146469831466675, "loss/hidden": 4.640625, "loss/jsd": 0.0, "loss/logits": 0.4536858946084976, "step": 282 }, { "epoch": 0.01775, "grad_norm": 7.28125, "grad_norm_var": 0.18800455729166668, "learning_rate": 0.0001, "loss": 11.688, "loss/crossentropy": 2.450984477996826, "loss/hidden": 4.671875, "loss/jsd": 0.0, "loss/logits": 0.4384435713291168, "step": 284 }, { "epoch": 0.017875, "grad_norm": 6.25, "grad_norm_var": 0.18196207682291668, "learning_rate": 0.0001, "loss": 11.8491, "loss/crossentropy": 2.374552607536316, "loss/hidden": 4.609375, "loss/jsd": 0.0, "loss/logits": 0.42816005647182465, "step": 286 }, { "epoch": 0.018, "grad_norm": 5.625, "grad_norm_var": 0.16256510416666667, "learning_rate": 0.0001, "loss": 11.6167, "loss/crossentropy": 2.4289149045944214, "loss/hidden": 4.609375, "loss/jsd": 0.0, "loss/logits": 0.4989480674266815, "step": 288 }, { "epoch": 0.018125, "grad_norm": 6.46875, "grad_norm_var": 0.16741129557291667, "learning_rate": 0.0001, "loss": 11.5302, "loss/crossentropy": 2.5054534673690796, "loss/hidden": 4.578125, "loss/jsd": 0.0, "loss/logits": 0.45670510828495026, "step": 290 }, { "epoch": 0.01825, "grad_norm": 5.46875, "grad_norm_var": 0.18970947265625, "learning_rate": 0.0001, "loss": 11.3159, "loss/crossentropy": 2.421309471130371, "loss/hidden": 4.46875, "loss/jsd": 0.0, "loss/logits": 0.4126213788986206, "step": 292 }, { "epoch": 0.018375, "grad_norm": 6.71875, "grad_norm_var": 0.21484375, "learning_rate": 0.0001, "loss": 11.7242, "loss/crossentropy": 2.6865302324295044, "loss/hidden": 4.609375, "loss/jsd": 0.0, "loss/logits": 0.48639141023159027, "step": 294 }, { "epoch": 0.0185, "grad_norm": 6.71875, "grad_norm_var": 0.23710530598958332, "learning_rate": 0.0001, "loss": 11.7623, "loss/crossentropy": 2.530078172683716, "loss/hidden": 4.625, "loss/jsd": 0.0, "loss/logits": 0.4684665650129318, "step": 296 }, { "epoch": 0.018625, "grad_norm": 5.8125, "grad_norm_var": 0.24947916666666667, "learning_rate": 0.0001, "loss": 11.4115, "loss/crossentropy": 2.2663209438323975, "loss/hidden": 4.453125, "loss/jsd": 0.0, "loss/logits": 0.4574204683303833, "step": 298 }, { "epoch": 0.01875, "grad_norm": 6.0, "grad_norm_var": 0.17994384765625, "learning_rate": 0.0001, "loss": 11.4217, "loss/crossentropy": 2.260213851928711, "loss/hidden": 4.578125, "loss/jsd": 0.0, "loss/logits": 0.48128053545951843, "step": 300 }, { "epoch": 0.018875, "grad_norm": 6.0, "grad_norm_var": 0.15660400390625, "learning_rate": 0.0001, "loss": 11.3582, "loss/crossentropy": 2.445768356323242, "loss/hidden": 4.421875, "loss/jsd": 0.0, "loss/logits": 0.44302406907081604, "step": 302 }, { "epoch": 0.019, "grad_norm": 7.40625, "grad_norm_var": 0.284619140625, "learning_rate": 0.0001, "loss": 11.3538, "loss/crossentropy": 2.497570037841797, "loss/hidden": 4.484375, "loss/jsd": 0.0, "loss/logits": 0.42873415350914, "step": 304 }, { "epoch": 0.019125, "grad_norm": 5.375, "grad_norm_var": 0.33358968098958336, "learning_rate": 0.0001, "loss": 11.704, "loss/crossentropy": 2.4702342748641968, "loss/hidden": 4.515625, "loss/jsd": 0.0, "loss/logits": 0.4643610119819641, "step": 306 }, { "epoch": 0.01925, "grad_norm": 5.875, "grad_norm_var": 0.3282389322916667, "learning_rate": 0.0001, "loss": 11.5569, "loss/crossentropy": 2.613131880760193, "loss/hidden": 4.4375, "loss/jsd": 0.0, "loss/logits": 0.4228426665067673, "step": 308 }, { "epoch": 0.019375, "grad_norm": 5.9375, "grad_norm_var": 0.32916259765625, "learning_rate": 0.0001, "loss": 11.82, "loss/crossentropy": 2.655569911003113, "loss/hidden": 4.46875, "loss/jsd": 0.0, "loss/logits": 0.4577495604753494, "step": 310 }, { "epoch": 0.0195, "grad_norm": 5.5, "grad_norm_var": 0.396875, "learning_rate": 0.0001, "loss": 11.2265, "loss/crossentropy": 2.5523258447647095, "loss/hidden": 4.4375, "loss/jsd": 0.0, "loss/logits": 0.4802166670560837, "step": 312 }, { "epoch": 0.019625, "grad_norm": 6.0625, "grad_norm_var": 0.4364420572916667, "learning_rate": 0.0001, "loss": 11.369, "loss/crossentropy": 2.7622017860412598, "loss/hidden": 4.421875, "loss/jsd": 0.0, "loss/logits": 0.4416071027517319, "step": 314 }, { "epoch": 0.01975, "grad_norm": 6.46875, "grad_norm_var": 0.43814697265625, "learning_rate": 0.0001, "loss": 11.5738, "loss/crossentropy": 2.592649459838867, "loss/hidden": 4.515625, "loss/jsd": 0.0, "loss/logits": 0.4513923078775406, "step": 316 }, { "epoch": 0.019875, "grad_norm": 6.90625, "grad_norm_var": 0.48209228515625, "learning_rate": 0.0001, "loss": 11.6672, "loss/crossentropy": 2.4992705583572388, "loss/hidden": 4.40625, "loss/jsd": 0.0, "loss/logits": 0.4300658255815506, "step": 318 }, { "epoch": 0.02, "grad_norm": 5.84375, "grad_norm_var": 0.34729410807291666, "learning_rate": 0.0001, "loss": 11.2722, "loss/crossentropy": 2.493373394012451, "loss/hidden": 4.421875, "loss/jsd": 0.0, "loss/logits": 0.442849725484848, "step": 320 }, { "epoch": 0.020125, "grad_norm": 6.28125, "grad_norm_var": 0.30530192057291666, "learning_rate": 0.0001, "loss": 11.5529, "loss/crossentropy": 2.547645926475525, "loss/hidden": 4.5625, "loss/jsd": 0.0, "loss/logits": 0.48772141337394714, "step": 322 }, { "epoch": 0.02025, "grad_norm": 5.6875, "grad_norm_var": 0.30276285807291664, "learning_rate": 0.0001, "loss": 11.7243, "loss/crossentropy": 2.523668885231018, "loss/hidden": 4.578125, "loss/jsd": 0.0, "loss/logits": 0.46751105785369873, "step": 324 }, { "epoch": 0.020375, "grad_norm": 5.5, "grad_norm_var": 0.25611979166666665, "learning_rate": 0.0001, "loss": 11.3933, "loss/crossentropy": 2.300553321838379, "loss/hidden": 4.453125, "loss/jsd": 0.0, "loss/logits": 0.4517511576414108, "step": 326 }, { "epoch": 0.0205, "grad_norm": 6.46875, "grad_norm_var": 0.2505859375, "learning_rate": 0.0001, "loss": 11.1293, "loss/crossentropy": 2.2729378938674927, "loss/hidden": 4.46875, "loss/jsd": 0.0, "loss/logits": 0.40413306653499603, "step": 328 }, { "epoch": 0.020625, "grad_norm": 6.0, "grad_norm_var": 0.21888020833333333, "learning_rate": 0.0001, "loss": 11.1315, "loss/crossentropy": 2.4203063249588013, "loss/hidden": 4.46875, "loss/jsd": 0.0, "loss/logits": 0.4310520142316818, "step": 330 }, { "epoch": 0.02075, "grad_norm": 5.65625, "grad_norm_var": 0.23023681640625, "learning_rate": 0.0001, "loss": 11.1679, "loss/crossentropy": 2.399292469024658, "loss/hidden": 4.4375, "loss/jsd": 0.0, "loss/logits": 0.42272189259529114, "step": 332 }, { "epoch": 0.020875, "grad_norm": 6.625, "grad_norm_var": 0.326953125, "learning_rate": 0.0001, "loss": 11.5921, "loss/crossentropy": 2.7525601387023926, "loss/hidden": 4.578125, "loss/jsd": 0.0, "loss/logits": 0.4559982270002365, "step": 334 }, { "epoch": 0.021, "grad_norm": 4.71875, "grad_norm_var": 0.3798136393229167, "learning_rate": 0.0001, "loss": 11.1814, "loss/crossentropy": 2.5639878511428833, "loss/hidden": 4.4375, "loss/jsd": 0.0, "loss/logits": 0.4693567156791687, "step": 336 }, { "epoch": 0.021125, "grad_norm": 5.15625, "grad_norm_var": 0.3953125, "learning_rate": 0.0001, "loss": 11.0082, "loss/crossentropy": 2.3304390907287598, "loss/hidden": 4.40625, "loss/jsd": 0.0, "loss/logits": 0.4606510251760483, "step": 338 }, { "epoch": 0.02125, "grad_norm": 6.59375, "grad_norm_var": 0.4320597330729167, "learning_rate": 0.0001, "loss": 11.3795, "loss/crossentropy": 2.3963816165924072, "loss/hidden": 4.46875, "loss/jsd": 0.0, "loss/logits": 0.4275331646203995, "step": 340 }, { "epoch": 0.021375, "grad_norm": 5.34375, "grad_norm_var": 0.53834228515625, "learning_rate": 0.0001, "loss": 11.5201, "loss/crossentropy": 2.6739399433135986, "loss/hidden": 4.5625, "loss/jsd": 0.0, "loss/logits": 0.5361433923244476, "step": 342 }, { "epoch": 0.0215, "grad_norm": 6.28125, "grad_norm_var": 0.532421875, "learning_rate": 0.0001, "loss": 11.2029, "loss/crossentropy": 2.246910810470581, "loss/hidden": 4.28125, "loss/jsd": 0.0, "loss/logits": 0.40490400791168213, "step": 344 }, { "epoch": 0.021625, "grad_norm": 5.0625, "grad_norm_var": 0.5819010416666667, "learning_rate": 0.0001, "loss": 11.0779, "loss/crossentropy": 2.273064136505127, "loss/hidden": 4.3125, "loss/jsd": 0.0, "loss/logits": 0.4033001661300659, "step": 346 }, { "epoch": 0.02175, "grad_norm": 6.1875, "grad_norm_var": 0.559375, "learning_rate": 0.0001, "loss": 11.326, "loss/crossentropy": 2.379375696182251, "loss/hidden": 4.34375, "loss/jsd": 0.0, "loss/logits": 0.4155968874692917, "step": 348 }, { "epoch": 0.021875, "grad_norm": 5.5, "grad_norm_var": 0.3761678059895833, "learning_rate": 0.0001, "loss": 11.0718, "loss/crossentropy": 2.4657520055770874, "loss/hidden": 4.265625, "loss/jsd": 0.0, "loss/logits": 0.4061667025089264, "step": 350 }, { "epoch": 0.022, "grad_norm": 5.8125, "grad_norm_var": 0.30662434895833335, "learning_rate": 0.0001, "loss": 11.0622, "loss/crossentropy": 2.480490565299988, "loss/hidden": 4.46875, "loss/jsd": 0.0, "loss/logits": 0.43048132956027985, "step": 352 }, { "epoch": 0.022125, "grad_norm": 5.96875, "grad_norm_var": 0.28765869140625, "learning_rate": 0.0001, "loss": 11.5931, "loss/crossentropy": 2.6550657749176025, "loss/hidden": 4.390625, "loss/jsd": 0.0, "loss/logits": 0.4658609926700592, "step": 354 }, { "epoch": 0.02225, "grad_norm": 5.5625, "grad_norm_var": 0.26099853515625, "learning_rate": 0.0001, "loss": 11.0385, "loss/crossentropy": 2.5135127305984497, "loss/hidden": 4.484375, "loss/jsd": 0.0, "loss/logits": 0.4206026792526245, "step": 356 }, { "epoch": 0.022375, "grad_norm": 5.28125, "grad_norm_var": 0.15416259765625, "learning_rate": 0.0001, "loss": 11.2004, "loss/crossentropy": 2.5652741193771362, "loss/hidden": 4.375, "loss/jsd": 0.0, "loss/logits": 0.41297411918640137, "step": 358 }, { "epoch": 0.0225, "grad_norm": 6.0, "grad_norm_var": 0.13004150390625, "learning_rate": 0.0001, "loss": 10.9409, "loss/crossentropy": 2.472624897956848, "loss/hidden": 4.390625, "loss/jsd": 0.0, "loss/logits": 0.4368878901004791, "step": 360 }, { "epoch": 0.022625, "grad_norm": 6.34375, "grad_norm_var": 0.12721354166666668, "learning_rate": 0.0001, "loss": 11.2239, "loss/crossentropy": 2.3381993770599365, "loss/hidden": 4.28125, "loss/jsd": 0.0, "loss/logits": 0.40575116872787476, "step": 362 }, { "epoch": 0.02275, "grad_norm": 6.0, "grad_norm_var": 0.11730143229166666, "learning_rate": 0.0001, "loss": 11.1714, "loss/crossentropy": 2.4883733987808228, "loss/hidden": 4.28125, "loss/jsd": 0.0, "loss/logits": 0.44851288199424744, "step": 364 }, { "epoch": 0.022875, "grad_norm": 5.4375, "grad_norm_var": 0.160791015625, "learning_rate": 0.0001, "loss": 11.059, "loss/crossentropy": 2.4785518646240234, "loss/hidden": 4.25, "loss/jsd": 0.0, "loss/logits": 0.4333874583244324, "step": 366 }, { "epoch": 0.023, "grad_norm": 5.53125, "grad_norm_var": 0.16568603515625, "learning_rate": 0.0001, "loss": 10.9517, "loss/crossentropy": 2.678915023803711, "loss/hidden": 4.375, "loss/jsd": 0.0, "loss/logits": 0.41231468319892883, "step": 368 }, { "epoch": 0.023125, "grad_norm": 5.15625, "grad_norm_var": 0.22571207682291666, "learning_rate": 0.0001, "loss": 10.9966, "loss/crossentropy": 2.273219585418701, "loss/hidden": 4.234375, "loss/jsd": 0.0, "loss/logits": 0.42474931478500366, "step": 370 }, { "epoch": 0.02325, "grad_norm": 5.875, "grad_norm_var": 0.21131184895833333, "learning_rate": 0.0001, "loss": 11.0027, "loss/crossentropy": 2.3425220251083374, "loss/hidden": 4.3125, "loss/jsd": 0.0, "loss/logits": 0.42784735560417175, "step": 372 }, { "epoch": 0.023375, "grad_norm": 5.28125, "grad_norm_var": 0.26174723307291664, "learning_rate": 0.0001, "loss": 11.0737, "loss/crossentropy": 2.5677989721298218, "loss/hidden": 4.296875, "loss/jsd": 0.0, "loss/logits": 0.42918097972869873, "step": 374 }, { "epoch": 0.0235, "grad_norm": 5.6875, "grad_norm_var": 0.25816650390625, "learning_rate": 0.0001, "loss": 11.175, "loss/crossentropy": 2.4179869890213013, "loss/hidden": 4.28125, "loss/jsd": 0.0, "loss/logits": 0.3684898316860199, "step": 376 }, { "epoch": 0.023625, "grad_norm": 5.375, "grad_norm_var": 0.20123291015625, "learning_rate": 0.0001, "loss": 10.9388, "loss/crossentropy": 2.52616810798645, "loss/hidden": 4.296875, "loss/jsd": 0.0, "loss/logits": 0.43494799733161926, "step": 378 }, { "epoch": 0.02375, "grad_norm": 5.4375, "grad_norm_var": 0.18599853515625, "learning_rate": 0.0001, "loss": 10.9656, "loss/crossentropy": 2.2381917238235474, "loss/hidden": 4.234375, "loss/jsd": 0.0, "loss/logits": 0.4034838378429413, "step": 380 }, { "epoch": 0.023875, "grad_norm": 5.40625, "grad_norm_var": 0.16360677083333333, "learning_rate": 0.0001, "loss": 10.9349, "loss/crossentropy": 2.5972191095352173, "loss/hidden": 4.296875, "loss/jsd": 0.0, "loss/logits": 0.41289472579956055, "step": 382 }, { "epoch": 0.024, "grad_norm": 6.9375, "grad_norm_var": 0.2912109375, "learning_rate": 0.0001, "loss": 10.8274, "loss/crossentropy": 2.310404658317566, "loss/hidden": 4.28125, "loss/jsd": 0.0, "loss/logits": 0.4219682365655899, "step": 384 }, { "epoch": 0.024125, "grad_norm": 5.03125, "grad_norm_var": 0.25735677083333336, "learning_rate": 0.0001, "loss": 10.7633, "loss/crossentropy": 2.5043996572494507, "loss/hidden": 4.28125, "loss/jsd": 0.0, "loss/logits": 0.3986252546310425, "step": 386 }, { "epoch": 0.02425, "grad_norm": 5.46875, "grad_norm_var": 0.2723307291666667, "learning_rate": 0.0001, "loss": 10.7861, "loss/crossentropy": 2.4293577671051025, "loss/hidden": 4.328125, "loss/jsd": 0.0, "loss/logits": 0.422202467918396, "step": 388 }, { "epoch": 0.024375, "grad_norm": 4.59375, "grad_norm_var": 0.29833577473958334, "learning_rate": 0.0001, "loss": 11.0955, "loss/crossentropy": 2.5617754459381104, "loss/hidden": 4.25, "loss/jsd": 0.0, "loss/logits": 0.4159541577100754, "step": 390 }, { "epoch": 0.0245, "grad_norm": 6.375, "grad_norm_var": 0.3329264322916667, "learning_rate": 0.0001, "loss": 10.8782, "loss/crossentropy": 2.6902949810028076, "loss/hidden": 4.265625, "loss/jsd": 0.0, "loss/logits": 0.4204605668783188, "step": 392 }, { "epoch": 0.024625, "grad_norm": 4.75, "grad_norm_var": 0.3837890625, "learning_rate": 0.0001, "loss": 10.5797, "loss/crossentropy": 2.579855442047119, "loss/hidden": 4.203125, "loss/jsd": 0.0, "loss/logits": 0.3951130211353302, "step": 394 }, { "epoch": 0.02475, "grad_norm": 5.28125, "grad_norm_var": 0.3661295572916667, "learning_rate": 0.0001, "loss": 10.9375, "loss/crossentropy": 2.575868844985962, "loss/hidden": 4.3125, "loss/jsd": 0.0, "loss/logits": 0.4067578613758087, "step": 396 }, { "epoch": 0.024875, "grad_norm": 5.53125, "grad_norm_var": 0.3630859375, "learning_rate": 0.0001, "loss": 11.1795, "loss/crossentropy": 2.3657878637313843, "loss/hidden": 4.25, "loss/jsd": 0.0, "loss/logits": 0.4360276013612747, "step": 398 }, { "epoch": 0.025, "grad_norm": 5.0, "grad_norm_var": 0.2508748372395833, "learning_rate": 0.0001, "loss": 10.8242, "loss/crossentropy": 2.3945010900497437, "loss/hidden": 4.1875, "loss/jsd": 0.0, "loss/logits": 0.4413905292749405, "step": 400 }, { "epoch": 0.025125, "grad_norm": 5.25, "grad_norm_var": 0.2595703125, "learning_rate": 0.0001, "loss": 10.5581, "loss/crossentropy": 2.1977953910827637, "loss/hidden": 4.1875, "loss/jsd": 0.0, "loss/logits": 0.3550948351621628, "step": 402 }, { "epoch": 0.02525, "grad_norm": 4.875, "grad_norm_var": 0.23834635416666666, "learning_rate": 0.0001, "loss": 10.6479, "loss/crossentropy": 2.3345136642456055, "loss/hidden": 4.265625, "loss/jsd": 0.0, "loss/logits": 0.38985343277454376, "step": 404 }, { "epoch": 0.025375, "grad_norm": 5.46875, "grad_norm_var": 0.15650634765625, "learning_rate": 0.0001, "loss": 10.6649, "loss/crossentropy": 2.44900119304657, "loss/hidden": 4.21875, "loss/jsd": 0.0, "loss/logits": 0.4164131134748459, "step": 406 }, { "epoch": 0.0255, "grad_norm": 4.75, "grad_norm_var": 0.083203125, "learning_rate": 0.0001, "loss": 10.6422, "loss/crossentropy": 2.4326157569885254, "loss/hidden": 4.265625, "loss/jsd": 0.0, "loss/logits": 0.409572958946228, "step": 408 }, { "epoch": 0.025625, "grad_norm": 5.21875, "grad_norm_var": 0.06988525390625, "learning_rate": 0.0001, "loss": 10.6566, "loss/crossentropy": 2.4960622787475586, "loss/hidden": 4.171875, "loss/jsd": 0.0, "loss/logits": 0.41588515043258667, "step": 410 }, { "epoch": 0.02575, "grad_norm": 5.3125, "grad_norm_var": 0.06795247395833333, "learning_rate": 0.0001, "loss": 10.7553, "loss/crossentropy": 2.590168833732605, "loss/hidden": 4.265625, "loss/jsd": 0.0, "loss/logits": 0.4012999087572098, "step": 412 }, { "epoch": 0.025875, "grad_norm": 5.65625, "grad_norm_var": 0.6649739583333333, "learning_rate": 0.0001, "loss": 11.135, "loss/crossentropy": 2.545465111732483, "loss/hidden": 4.375, "loss/jsd": 0.0, "loss/logits": 0.4800722002983093, "step": 414 }, { "epoch": 0.026, "grad_norm": 5.21875, "grad_norm_var": 0.6665974934895833, "learning_rate": 0.0001, "loss": 10.9887, "loss/crossentropy": 2.4133318662643433, "loss/hidden": 4.140625, "loss/jsd": 0.0, "loss/logits": 0.4137228727340698, "step": 416 }, { "epoch": 0.026125, "grad_norm": 5.09375, "grad_norm_var": 0.6603474934895833, "learning_rate": 0.0001, "loss": 10.5496, "loss/crossentropy": 2.1947737336158752, "loss/hidden": 4.140625, "loss/jsd": 0.0, "loss/logits": 0.4033205509185791, "step": 418 }, { "epoch": 0.02625, "grad_norm": 5.375, "grad_norm_var": 0.6346354166666667, "learning_rate": 0.0001, "loss": 10.7508, "loss/crossentropy": 2.1539926528930664, "loss/hidden": 4.15625, "loss/jsd": 0.0, "loss/logits": 0.40337586402893066, "step": 420 }, { "epoch": 0.026375, "grad_norm": 6.75, "grad_norm_var": 0.7378743489583334, "learning_rate": 0.0001, "loss": 11.0461, "loss/crossentropy": 2.524567127227783, "loss/hidden": 4.25, "loss/jsd": 0.0, "loss/logits": 0.4650000333786011, "step": 422 }, { "epoch": 0.0265, "grad_norm": 5.3125, "grad_norm_var": 0.6646443684895833, "learning_rate": 0.0001, "loss": 11.0872, "loss/crossentropy": 2.333670735359192, "loss/hidden": 4.203125, "loss/jsd": 0.0, "loss/logits": 0.4344635456800461, "step": 424 }, { "epoch": 0.026625, "grad_norm": 5.3125, "grad_norm_var": 0.6597005208333333, "learning_rate": 0.0001, "loss": 10.8727, "loss/crossentropy": 2.4718040227890015, "loss/hidden": 4.1875, "loss/jsd": 0.0, "loss/logits": 0.42122460901737213, "step": 426 }, { "epoch": 0.02675, "grad_norm": 4.75, "grad_norm_var": 0.67672119140625, "learning_rate": 0.0001, "loss": 10.7434, "loss/crossentropy": 2.60198974609375, "loss/hidden": 4.25, "loss/jsd": 0.0, "loss/logits": 0.41160446405410767, "step": 428 }, { "epoch": 0.026875, "grad_norm": 5.0625, "grad_norm_var": 0.20933837890625, "learning_rate": 0.0001, "loss": 10.79, "loss/crossentropy": 2.614295244216919, "loss/hidden": 4.234375, "loss/jsd": 0.0, "loss/logits": 0.4241054952144623, "step": 430 }, { "epoch": 0.027, "grad_norm": 5.65625, "grad_norm_var": 0.202587890625, "learning_rate": 0.0001, "loss": 10.7823, "loss/crossentropy": 2.578159213066101, "loss/hidden": 4.203125, "loss/jsd": 0.0, "loss/logits": 0.40188899636268616, "step": 432 }, { "epoch": 0.027125, "grad_norm": 5.09375, "grad_norm_var": 0.2130859375, "learning_rate": 0.0001, "loss": 10.5528, "loss/crossentropy": 2.068563759326935, "loss/hidden": 4.09375, "loss/jsd": 0.0, "loss/logits": 0.4180772006511688, "step": 434 }, { "epoch": 0.02725, "grad_norm": 6.0, "grad_norm_var": 0.23834228515625, "learning_rate": 0.0001, "loss": 10.6453, "loss/crossentropy": 2.596070408821106, "loss/hidden": 4.25, "loss/jsd": 0.0, "loss/logits": 0.4149198830127716, "step": 436 }, { "epoch": 0.027375, "grad_norm": 4.75, "grad_norm_var": 0.151025390625, "learning_rate": 0.0001, "loss": 10.5426, "loss/crossentropy": 2.305434226989746, "loss/hidden": 4.1875, "loss/jsd": 0.0, "loss/logits": 0.384622722864151, "step": 438 }, { "epoch": 0.0275, "grad_norm": 4.96875, "grad_norm_var": 0.1396484375, "learning_rate": 0.0001, "loss": 10.7981, "loss/crossentropy": 2.502975344657898, "loss/hidden": 4.1875, "loss/jsd": 0.0, "loss/logits": 0.37728238105773926, "step": 440 }, { "epoch": 0.027625, "grad_norm": 5.25, "grad_norm_var": 0.142822265625, "learning_rate": 0.0001, "loss": 10.4751, "loss/crossentropy": 2.305208921432495, "loss/hidden": 4.1875, "loss/jsd": 0.0, "loss/logits": 0.38094912469387054, "step": 442 }, { "epoch": 0.02775, "grad_norm": 5.625, "grad_norm_var": 0.13279622395833332, "learning_rate": 0.0001, "loss": 10.7068, "loss/crossentropy": 2.6903436183929443, "loss/hidden": 4.15625, "loss/jsd": 0.0, "loss/logits": 0.3932228535413742, "step": 444 }, { "epoch": 0.027875, "grad_norm": 6.53125, "grad_norm_var": 0.229541015625, "learning_rate": 0.0001, "loss": 10.9159, "loss/crossentropy": 2.724982738494873, "loss/hidden": 4.234375, "loss/jsd": 0.0, "loss/logits": 0.48244524002075195, "step": 446 }, { "epoch": 0.028, "grad_norm": 4.71875, "grad_norm_var": 0.26022135416666664, "learning_rate": 0.0001, "loss": 10.5689, "loss/crossentropy": 2.1659945249557495, "loss/hidden": 4.171875, "loss/jsd": 0.0, "loss/logits": 0.38983820378780365, "step": 448 }, { "epoch": 0.028125, "grad_norm": 6.09375, "grad_norm_var": 0.3133748372395833, "learning_rate": 0.0001, "loss": 10.4284, "loss/crossentropy": 2.4513895511627197, "loss/hidden": 4.28125, "loss/jsd": 0.0, "loss/logits": 0.3891524076461792, "step": 450 }, { "epoch": 0.02825, "grad_norm": 5.28125, "grad_norm_var": 0.28541666666666665, "learning_rate": 0.0001, "loss": 10.6392, "loss/crossentropy": 2.3762803077697754, "loss/hidden": 4.203125, "loss/jsd": 0.0, "loss/logits": 0.4033055752515793, "step": 452 }, { "epoch": 0.028375, "grad_norm": 6.625, "grad_norm_var": 0.87955322265625, "learning_rate": 0.0001, "loss": 10.9979, "loss/crossentropy": 2.5074435472488403, "loss/hidden": 4.234375, "loss/jsd": 0.0, "loss/logits": 0.4134407788515091, "step": 454 }, { "epoch": 0.0285, "grad_norm": 5.3125, "grad_norm_var": 0.85484619140625, "learning_rate": 0.0001, "loss": 10.7103, "loss/crossentropy": 2.4922858476638794, "loss/hidden": 4.203125, "loss/jsd": 0.0, "loss/logits": 0.40840162336826324, "step": 456 }, { "epoch": 0.028625, "grad_norm": 4.96875, "grad_norm_var": 0.9095662434895834, "learning_rate": 0.0001, "loss": 10.4182, "loss/crossentropy": 2.332331657409668, "loss/hidden": 4.171875, "loss/jsd": 0.0, "loss/logits": 0.37381240725517273, "step": 458 }, { "epoch": 0.02875, "grad_norm": 4.8125, "grad_norm_var": 0.9465983072916667, "learning_rate": 0.0001, "loss": 10.5666, "loss/crossentropy": 2.6399567127227783, "loss/hidden": 4.15625, "loss/jsd": 0.0, "loss/logits": 0.38797518610954285, "step": 460 }, { "epoch": 0.028875, "grad_norm": 5.3125, "grad_norm_var": 0.889697265625, "learning_rate": 0.0001, "loss": 10.5769, "loss/crossentropy": 2.302717089653015, "loss/hidden": 4.1875, "loss/jsd": 0.0, "loss/logits": 0.3878500908613205, "step": 462 }, { "epoch": 0.029, "grad_norm": 4.96875, "grad_norm_var": 0.8904256184895833, "learning_rate": 0.0001, "loss": 10.4066, "loss/crossentropy": 2.3778563737869263, "loss/hidden": 4.109375, "loss/jsd": 0.0, "loss/logits": 0.38498103618621826, "step": 464 }, { "epoch": 0.029125, "grad_norm": 4.78125, "grad_norm_var": 0.854931640625, "learning_rate": 0.0001, "loss": 10.6144, "loss/crossentropy": 2.0649060010910034, "loss/hidden": 4.140625, "loss/jsd": 0.0, "loss/logits": 0.3695131242275238, "step": 466 }, { "epoch": 0.02925, "grad_norm": 4.90625, "grad_norm_var": 0.8793619791666667, "learning_rate": 0.0001, "loss": 10.7395, "loss/crossentropy": 2.8441067934036255, "loss/hidden": 4.21875, "loss/jsd": 0.0, "loss/logits": 0.3995439410209656, "step": 468 }, { "epoch": 0.029375, "grad_norm": 6.46875, "grad_norm_var": 0.209765625, "learning_rate": 0.0001, "loss": 10.6913, "loss/crossentropy": 2.557037353515625, "loss/hidden": 4.109375, "loss/jsd": 0.0, "loss/logits": 0.410599485039711, "step": 470 }, { "epoch": 0.0295, "grad_norm": 5.375, "grad_norm_var": 0.223828125, "learning_rate": 0.0001, "loss": 10.6722, "loss/crossentropy": 2.367073655128479, "loss/hidden": 4.140625, "loss/jsd": 0.0, "loss/logits": 0.3934127390384674, "step": 472 }, { "epoch": 0.029625, "grad_norm": 5.03125, "grad_norm_var": 0.21222330729166666, "learning_rate": 0.0001, "loss": 10.6489, "loss/crossentropy": 2.4897983074188232, "loss/hidden": 4.109375, "loss/jsd": 0.0, "loss/logits": 0.42430558800697327, "step": 474 }, { "epoch": 0.02975, "grad_norm": 5.5, "grad_norm_var": 0.20976155598958332, "learning_rate": 0.0001, "loss": 10.6471, "loss/crossentropy": 2.4441006183624268, "loss/hidden": 4.09375, "loss/jsd": 0.0, "loss/logits": 0.4049318730831146, "step": 476 }, { "epoch": 0.029875, "grad_norm": 4.46875, "grad_norm_var": 0.24075520833333333, "learning_rate": 0.0001, "loss": 10.9236, "loss/crossentropy": 2.4001717567443848, "loss/hidden": 4.140625, "loss/jsd": 0.0, "loss/logits": 0.3930549919605255, "step": 478 }, { "epoch": 0.03, "grad_norm": 4.9375, "grad_norm_var": 0.22499593098958334, "learning_rate": 0.0001, "loss": 10.4589, "loss/crossentropy": 2.5291205644607544, "loss/hidden": 4.15625, "loss/jsd": 0.0, "loss/logits": 0.43080681562423706, "step": 480 }, { "epoch": 0.030125, "grad_norm": 6.40625, "grad_norm_var": 0.30774332682291666, "learning_rate": 0.0001, "loss": 10.4734, "loss/crossentropy": 2.388319194316864, "loss/hidden": 4.21875, "loss/jsd": 0.0, "loss/logits": 0.3897434026002884, "step": 482 }, { "epoch": 0.03025, "grad_norm": 5.5625, "grad_norm_var": 0.29670817057291665, "learning_rate": 0.0001, "loss": 10.4658, "loss/crossentropy": 2.2806296348571777, "loss/hidden": 4.1875, "loss/jsd": 0.0, "loss/logits": 0.3782193958759308, "step": 484 }, { "epoch": 0.030375, "grad_norm": 7.34375, "grad_norm_var": 1.065625, "learning_rate": 0.0001, "loss": 10.5857, "loss/crossentropy": 2.3721553087234497, "loss/hidden": 4.109375, "loss/jsd": 0.0, "loss/logits": 0.4088495075702667, "step": 486 }, { "epoch": 0.0305, "grad_norm": 4.59375, "grad_norm_var": 1.071728515625, "learning_rate": 0.0001, "loss": 10.3529, "loss/crossentropy": 2.214341640472412, "loss/hidden": 4.109375, "loss/jsd": 0.0, "loss/logits": 0.37406377494335175, "step": 488 }, { "epoch": 0.030625, "grad_norm": 5.0, "grad_norm_var": 1.05445556640625, "learning_rate": 0.0001, "loss": 10.5074, "loss/crossentropy": 2.5398401021957397, "loss/hidden": 4.171875, "loss/jsd": 0.0, "loss/logits": 0.4116057902574539, "step": 490 }, { "epoch": 0.03075, "grad_norm": 5.28125, "grad_norm_var": 1.0507649739583333, "learning_rate": 0.0001, "loss": 10.5333, "loss/crossentropy": 2.6637160778045654, "loss/hidden": 4.140625, "loss/jsd": 0.0, "loss/logits": 0.40726278722286224, "step": 492 }, { "epoch": 0.030875, "grad_norm": 4.625, "grad_norm_var": 1.0790974934895834, "learning_rate": 0.0001, "loss": 10.3532, "loss/crossentropy": 2.311811923980713, "loss/hidden": 4.09375, "loss/jsd": 0.0, "loss/logits": 0.3635952025651932, "step": 494 }, { "epoch": 0.031, "grad_norm": 5.4375, "grad_norm_var": 1.1916300455729167, "learning_rate": 0.0001, "loss": 10.7405, "loss/crossentropy": 2.5883506536483765, "loss/hidden": 4.1875, "loss/jsd": 0.0, "loss/logits": 0.4075654149055481, "step": 496 }, { "epoch": 0.031125, "grad_norm": 7.46875, "grad_norm_var": 1.3474894205729167, "learning_rate": 0.0001, "loss": 10.7376, "loss/crossentropy": 2.5056556463241577, "loss/hidden": 4.171875, "loss/jsd": 0.0, "loss/logits": 0.4202713221311569, "step": 498 }, { "epoch": 0.03125, "grad_norm": 5.53125, "grad_norm_var": 1.3410807291666667, "learning_rate": 0.0001, "loss": 10.6792, "loss/crossentropy": 2.5138707160949707, "loss/hidden": 4.21875, "loss/jsd": 0.0, "loss/logits": 0.44250351190567017, "step": 500 }, { "epoch": 0.031375, "grad_norm": 4.84375, "grad_norm_var": 0.64498291015625, "learning_rate": 0.0001, "loss": 10.5416, "loss/crossentropy": 2.6356310844421387, "loss/hidden": 4.09375, "loss/jsd": 0.0, "loss/logits": 0.40963026881217957, "step": 502 }, { "epoch": 0.0315, "grad_norm": 4.40625, "grad_norm_var": 0.8839152018229167, "learning_rate": 0.0001, "loss": 10.5626, "loss/crossentropy": 2.4822793006896973, "loss/hidden": 4.125, "loss/jsd": 0.0, "loss/logits": 0.3620312958955765, "step": 504 }, { "epoch": 0.031625, "grad_norm": 5.40625, "grad_norm_var": 0.86373291015625, "learning_rate": 0.0001, "loss": 10.5343, "loss/crossentropy": 2.581249713897705, "loss/hidden": 4.171875, "loss/jsd": 0.0, "loss/logits": 0.39074860513210297, "step": 506 }, { "epoch": 0.03175, "grad_norm": 4.65625, "grad_norm_var": 0.991259765625, "learning_rate": 0.0001, "loss": 10.3297, "loss/crossentropy": 2.4520576000213623, "loss/hidden": 4.015625, "loss/jsd": 0.0, "loss/logits": 0.38373272120952606, "step": 508 }, { "epoch": 0.031875, "grad_norm": 5.03125, "grad_norm_var": 0.913134765625, "learning_rate": 0.0001, "loss": 10.6075, "loss/crossentropy": 2.5660005807876587, "loss/hidden": 4.203125, "loss/jsd": 0.0, "loss/logits": 0.44344601035118103, "step": 510 }, { "epoch": 0.032, "grad_norm": 4.90625, "grad_norm_var": 0.7636027018229167, "learning_rate": 0.0001, "loss": 10.4046, "loss/crossentropy": 2.527319550514221, "loss/hidden": 4.046875, "loss/jsd": 0.0, "loss/logits": 0.37920865416526794, "step": 512 }, { "epoch": 0.032125, "grad_norm": 4.5, "grad_norm_var": 0.46835530598958336, "learning_rate": 0.0001, "loss": 10.0486, "loss/crossentropy": 2.509230613708496, "loss/hidden": 4.0625, "loss/jsd": 0.0, "loss/logits": 0.3975517302751541, "step": 514 }, { "epoch": 0.03225, "grad_norm": 4.78125, "grad_norm_var": 0.45974934895833336, "learning_rate": 0.0001, "loss": 10.3748, "loss/crossentropy": 2.296907901763916, "loss/hidden": 4.09375, "loss/jsd": 0.0, "loss/logits": 0.37533947825431824, "step": 516 }, { "epoch": 0.032375, "grad_norm": 5.21875, "grad_norm_var": 0.45927327473958335, "learning_rate": 0.0001, "loss": 10.661, "loss/crossentropy": 2.587345600128174, "loss/hidden": 4.15625, "loss/jsd": 0.0, "loss/logits": 0.44121941924095154, "step": 518 }, { "epoch": 0.0325, "grad_norm": 5.25, "grad_norm_var": 0.121337890625, "learning_rate": 0.0001, "loss": 10.3292, "loss/crossentropy": 2.4897454977035522, "loss/hidden": 4.09375, "loss/jsd": 0.0, "loss/logits": 0.37143947184085846, "step": 520 }, { "epoch": 0.032625, "grad_norm": 5.3125, "grad_norm_var": 0.121337890625, "learning_rate": 0.0001, "loss": 10.5275, "loss/crossentropy": 2.3746429681777954, "loss/hidden": 4.03125, "loss/jsd": 0.0, "loss/logits": 0.36811254918575287, "step": 522 }, { "epoch": 0.03275, "grad_norm": 4.90625, "grad_norm_var": 0.09269205729166667, "learning_rate": 0.0001, "loss": 10.4857, "loss/crossentropy": 2.516595959663391, "loss/hidden": 4.15625, "loss/jsd": 0.0, "loss/logits": 0.3954416811466217, "step": 524 }, { "epoch": 0.032875, "grad_norm": 4.59375, "grad_norm_var": 0.12864176432291666, "learning_rate": 0.0001, "loss": 10.4063, "loss/crossentropy": 2.1858848333358765, "loss/hidden": 4.0, "loss/jsd": 0.0, "loss/logits": 0.3671702444553375, "step": 526 }, { "epoch": 0.033, "grad_norm": 4.71875, "grad_norm_var": 0.12877604166666667, "learning_rate": 0.0001, "loss": 10.4706, "loss/crossentropy": 2.3672547340393066, "loss/hidden": 4.03125, "loss/jsd": 0.0, "loss/logits": 0.42084239423274994, "step": 528 }, { "epoch": 0.033125, "grad_norm": 5.375, "grad_norm_var": 0.15623372395833332, "learning_rate": 0.0001, "loss": 10.5682, "loss/crossentropy": 2.4755560159683228, "loss/hidden": 4.109375, "loss/jsd": 0.0, "loss/logits": 0.38027940690517426, "step": 530 }, { "epoch": 0.03325, "grad_norm": 4.84375, "grad_norm_var": 0.16353759765625, "learning_rate": 0.0001, "loss": 10.3109, "loss/crossentropy": 2.4052809476852417, "loss/hidden": 4.015625, "loss/jsd": 0.0, "loss/logits": 0.37095922231674194, "step": 532 }, { "epoch": 0.033375, "grad_norm": 5.5625, "grad_norm_var": 0.185546875, "learning_rate": 0.0001, "loss": 10.6675, "loss/crossentropy": 2.7321990728378296, "loss/hidden": 4.140625, "loss/jsd": 0.0, "loss/logits": 0.40198850631713867, "step": 534 }, { "epoch": 0.0335, "grad_norm": 4.5, "grad_norm_var": 0.24308268229166666, "learning_rate": 0.0001, "loss": 10.4618, "loss/crossentropy": 2.609284520149231, "loss/hidden": 4.109375, "loss/jsd": 0.0, "loss/logits": 0.3769884407520294, "step": 536 }, { "epoch": 0.033625, "grad_norm": 4.625, "grad_norm_var": 0.229931640625, "learning_rate": 0.0001, "loss": 10.2295, "loss/crossentropy": 2.4272639751434326, "loss/hidden": 4.0390625, "loss/jsd": 0.0, "loss/logits": 0.38766802847385406, "step": 538 }, { "epoch": 0.03375, "grad_norm": 4.90625, "grad_norm_var": 0.22877604166666668, "learning_rate": 0.0001, "loss": 10.3895, "loss/crossentropy": 2.3962435722351074, "loss/hidden": 4.015625, "loss/jsd": 0.0, "loss/logits": 0.3932725340127945, "step": 540 }, { "epoch": 0.033875, "grad_norm": 5.21875, "grad_norm_var": 0.20539957682291668, "learning_rate": 0.0001, "loss": 10.1794, "loss/crossentropy": 2.4086058139801025, "loss/hidden": 3.921875, "loss/jsd": 0.0, "loss/logits": 0.37670063972473145, "step": 542 }, { "epoch": 0.034, "grad_norm": 4.90625, "grad_norm_var": 0.21051025390625, "learning_rate": 0.0001, "loss": 10.5333, "loss/crossentropy": 2.406098246574402, "loss/hidden": 4.09375, "loss/jsd": 0.0, "loss/logits": 0.3975420147180557, "step": 544 }, { "epoch": 0.034125, "grad_norm": 4.46875, "grad_norm_var": 0.19163004557291666, "learning_rate": 0.0001, "loss": 10.4258, "loss/crossentropy": 2.6447004079818726, "loss/hidden": 4.0, "loss/jsd": 0.0, "loss/logits": 0.40193626284599304, "step": 546 }, { "epoch": 0.03425, "grad_norm": 5.09375, "grad_norm_var": 0.17701416015625, "learning_rate": 0.0001, "loss": 10.5984, "loss/crossentropy": 2.570632576942444, "loss/hidden": 4.078125, "loss/jsd": 0.0, "loss/logits": 0.4150645583868027, "step": 548 }, { "epoch": 0.034375, "grad_norm": 5.1875, "grad_norm_var": 0.15758056640625, "learning_rate": 0.0001, "loss": 10.1492, "loss/crossentropy": 2.250994086265564, "loss/hidden": 4.0078125, "loss/jsd": 0.0, "loss/logits": 0.37618446350097656, "step": 550 }, { "epoch": 0.0345, "grad_norm": 4.59375, "grad_norm_var": 0.09928385416666667, "learning_rate": 0.0001, "loss": 10.2205, "loss/crossentropy": 2.2985726594924927, "loss/hidden": 3.96875, "loss/jsd": 0.0, "loss/logits": 0.38329558074474335, "step": 552 }, { "epoch": 0.034625, "grad_norm": 5.03125, "grad_norm_var": 0.084765625, "learning_rate": 0.0001, "loss": 10.2594, "loss/crossentropy": 2.409608244895935, "loss/hidden": 3.921875, "loss/jsd": 0.0, "loss/logits": 0.3573233038187027, "step": 554 }, { "epoch": 0.03475, "grad_norm": 5.0, "grad_norm_var": 0.08723958333333333, "learning_rate": 0.0001, "loss": 10.4628, "loss/crossentropy": 2.2365437746047974, "loss/hidden": 4.03125, "loss/jsd": 0.0, "loss/logits": 0.36603498458862305, "step": 556 }, { "epoch": 0.034875, "grad_norm": 5.28125, "grad_norm_var": 0.107275390625, "learning_rate": 0.0001, "loss": 10.4873, "loss/crossentropy": 2.30200457572937, "loss/hidden": 3.96875, "loss/jsd": 0.0, "loss/logits": 0.38180042803287506, "step": 558 }, { "epoch": 0.035, "grad_norm": 4.78125, "grad_norm_var": 0.156494140625, "learning_rate": 0.0001, "loss": 10.5403, "loss/crossentropy": 2.4553037881851196, "loss/hidden": 4.109375, "loss/jsd": 0.0, "loss/logits": 0.3985973298549652, "step": 560 }, { "epoch": 0.035125, "grad_norm": 4.875, "grad_norm_var": 0.15087483723958334, "learning_rate": 0.0001, "loss": 10.4663, "loss/crossentropy": 2.484908103942871, "loss/hidden": 4.09375, "loss/jsd": 0.0, "loss/logits": 0.39745810627937317, "step": 562 }, { "epoch": 0.03525, "grad_norm": 4.65625, "grad_norm_var": 0.15787760416666666, "learning_rate": 0.0001, "loss": 10.2568, "loss/crossentropy": 2.386349678039551, "loss/hidden": 3.8984375, "loss/jsd": 0.0, "loss/logits": 0.3671696186065674, "step": 564 }, { "epoch": 0.035375, "grad_norm": 4.5, "grad_norm_var": 0.15891520182291666, "learning_rate": 0.0001, "loss": 10.2005, "loss/crossentropy": 2.796995162963867, "loss/hidden": 4.046875, "loss/jsd": 0.0, "loss/logits": 0.3951940983533859, "step": 566 }, { "epoch": 0.0355, "grad_norm": 4.625, "grad_norm_var": 0.14308268229166668, "learning_rate": 0.0001, "loss": 10.3384, "loss/crossentropy": 2.167185425758362, "loss/hidden": 3.9921875, "loss/jsd": 0.0, "loss/logits": 0.3679066449403763, "step": 568 }, { "epoch": 0.035625, "grad_norm": 4.3125, "grad_norm_var": 0.16002197265625, "learning_rate": 0.0001, "loss": 10.1328, "loss/crossentropy": 2.331982374191284, "loss/hidden": 3.90625, "loss/jsd": 0.0, "loss/logits": 0.35966630280017853, "step": 570 }, { "epoch": 0.03575, "grad_norm": 4.9375, "grad_norm_var": 0.16490885416666667, "learning_rate": 0.0001, "loss": 10.2677, "loss/crossentropy": 2.545064091682434, "loss/hidden": 3.921875, "loss/jsd": 0.0, "loss/logits": 0.3585609495639801, "step": 572 }, { "epoch": 0.035875, "grad_norm": 4.5, "grad_norm_var": 0.13590087890625, "learning_rate": 0.0001, "loss": 10.1833, "loss/crossentropy": 2.673481822013855, "loss/hidden": 3.96875, "loss/jsd": 0.0, "loss/logits": 0.3861938863992691, "step": 574 }, { "epoch": 0.036, "grad_norm": 4.40625, "grad_norm_var": 0.06877848307291666, "learning_rate": 0.0001, "loss": 9.8329, "loss/crossentropy": 2.2054319381713867, "loss/hidden": 3.890625, "loss/jsd": 0.0, "loss/logits": 0.34671929478645325, "step": 576 }, { "epoch": 0.036125, "grad_norm": 4.40625, "grad_norm_var": 0.04698893229166667, "learning_rate": 0.0001, "loss": 9.8525, "loss/crossentropy": 2.3083781003952026, "loss/hidden": 3.8359375, "loss/jsd": 0.0, "loss/logits": 0.3572017103433609, "step": 578 }, { "epoch": 0.03625, "grad_norm": 4.84375, "grad_norm_var": 0.12235921223958333, "learning_rate": 0.0001, "loss": 10.1868, "loss/crossentropy": 2.5364460945129395, "loss/hidden": 3.8515625, "loss/jsd": 0.0, "loss/logits": 0.35016657412052155, "step": 580 }, { "epoch": 0.036375, "grad_norm": 4.6875, "grad_norm_var": 0.11933186848958334, "learning_rate": 0.0001, "loss": 10.1112, "loss/crossentropy": 2.1111658811569214, "loss/hidden": 3.921875, "loss/jsd": 0.0, "loss/logits": 0.35201945900917053, "step": 582 }, { "epoch": 0.0365, "grad_norm": 4.28125, "grad_norm_var": 0.13157145182291666, "learning_rate": 0.0001, "loss": 10.0743, "loss/crossentropy": 2.2730716466903687, "loss/hidden": 3.9609375, "loss/jsd": 0.0, "loss/logits": 0.3673980087041855, "step": 584 }, { "epoch": 0.036625, "grad_norm": 4.4375, "grad_norm_var": 0.11443684895833334, "learning_rate": 0.0001, "loss": 10.1844, "loss/crossentropy": 2.423087477684021, "loss/hidden": 4.0625, "loss/jsd": 0.0, "loss/logits": 0.3773965388536453, "step": 586 }, { "epoch": 0.03675, "grad_norm": 5.09375, "grad_norm_var": 0.11886393229166667, "learning_rate": 0.0001, "loss": 9.9989, "loss/crossentropy": 2.429610013961792, "loss/hidden": 4.046875, "loss/jsd": 0.0, "loss/logits": 0.37359054386615753, "step": 588 }, { "epoch": 0.036875, "grad_norm": 4.34375, "grad_norm_var": 0.15779622395833334, "learning_rate": 0.0001, "loss": 10.2243, "loss/crossentropy": 2.705647587776184, "loss/hidden": 4.140625, "loss/jsd": 0.0, "loss/logits": 0.418183371424675, "step": 590 }, { "epoch": 0.037, "grad_norm": 4.375, "grad_norm_var": 0.16418863932291666, "learning_rate": 0.0001, "loss": 10.3141, "loss/crossentropy": 2.5757263898849487, "loss/hidden": 3.984375, "loss/jsd": 0.0, "loss/logits": 0.38670916855335236, "step": 592 }, { "epoch": 0.037125, "grad_norm": 4.0, "grad_norm_var": 0.20058186848958334, "learning_rate": 0.0001, "loss": 9.958, "loss/crossentropy": 2.660887598991394, "loss/hidden": 3.8515625, "loss/jsd": 0.0, "loss/logits": 0.3619972914457321, "step": 594 }, { "epoch": 0.03725, "grad_norm": 4.34375, "grad_norm_var": 0.11832275390625, "learning_rate": 0.0001, "loss": 10.0076, "loss/crossentropy": 2.2089375257492065, "loss/hidden": 3.84375, "loss/jsd": 0.0, "loss/logits": 0.37448957562446594, "step": 596 }, { "epoch": 0.037375, "grad_norm": 4.9375, "grad_norm_var": 0.1216796875, "learning_rate": 0.0001, "loss": 10.307, "loss/crossentropy": 2.52751886844635, "loss/hidden": 4.046875, "loss/jsd": 0.0, "loss/logits": 0.3705534487962723, "step": 598 }, { "epoch": 0.0375, "grad_norm": 5.25, "grad_norm_var": 0.14667561848958333, "learning_rate": 0.0001, "loss": 9.986, "loss/crossentropy": 1.9802033305168152, "loss/hidden": 3.9453125, "loss/jsd": 0.0, "loss/logits": 0.34747298061847687, "step": 600 }, { "epoch": 0.037625, "grad_norm": 4.90625, "grad_norm_var": 0.16565348307291666, "learning_rate": 0.0001, "loss": 9.8585, "loss/crossentropy": 2.3861488103866577, "loss/hidden": 3.953125, "loss/jsd": 0.0, "loss/logits": 0.3828616738319397, "step": 602 }, { "epoch": 0.03775, "grad_norm": 17.125, "grad_norm_var": 9.949593098958333, "learning_rate": 0.0001, "loss": 10.9595, "loss/crossentropy": 2.5604605674743652, "loss/hidden": 3.984375, "loss/jsd": 0.0, "loss/logits": 0.3666844367980957, "step": 604 }, { "epoch": 0.037875, "grad_norm": 5.15625, "grad_norm_var": 9.885091145833334, "learning_rate": 0.0001, "loss": 9.9475, "loss/crossentropy": 2.2230526208877563, "loss/hidden": 3.9375, "loss/jsd": 0.0, "loss/logits": 0.3571869730949402, "step": 606 }, { "epoch": 0.038, "grad_norm": 4.53125, "grad_norm_var": 9.79449462890625, "learning_rate": 0.0001, "loss": 10.3505, "loss/crossentropy": 2.500959277153015, "loss/hidden": 4.0, "loss/jsd": 0.0, "loss/logits": 0.37224216759204865, "step": 608 }, { "epoch": 0.038125, "grad_norm": 4.875, "grad_norm_var": 9.682796223958333, "learning_rate": 0.0001, "loss": 9.9394, "loss/crossentropy": 2.41185462474823, "loss/hidden": 3.921875, "loss/jsd": 0.0, "loss/logits": 0.3980567008256912, "step": 610 }, { "epoch": 0.03825, "grad_norm": 4.65625, "grad_norm_var": 9.60631103515625, "learning_rate": 0.0001, "loss": 10.2283, "loss/crossentropy": 1.9943309426307678, "loss/hidden": 3.8125, "loss/jsd": 0.0, "loss/logits": 0.3476145565509796, "step": 612 }, { "epoch": 0.038375, "grad_norm": 4.9375, "grad_norm_var": 9.56656494140625, "learning_rate": 0.0001, "loss": 10.4794, "loss/crossentropy": 2.747882127761841, "loss/hidden": 4.046875, "loss/jsd": 0.0, "loss/logits": 0.4243037551641464, "step": 614 }, { "epoch": 0.0385, "grad_norm": 5.4375, "grad_norm_var": 9.51558837890625, "learning_rate": 0.0001, "loss": 10.3842, "loss/crossentropy": 2.56972599029541, "loss/hidden": 4.015625, "loss/jsd": 0.0, "loss/logits": 0.4048524647951126, "step": 616 }, { "epoch": 0.038625, "grad_norm": 4.40625, "grad_norm_var": 9.47301025390625, "learning_rate": 0.0001, "loss": 9.9875, "loss/crossentropy": 2.5478017330169678, "loss/hidden": 3.9375, "loss/jsd": 0.0, "loss/logits": 0.36699262261390686, "step": 618 }, { "epoch": 0.03875, "grad_norm": 4.3125, "grad_norm_var": 0.20136311848958333, "learning_rate": 0.0001, "loss": 9.9678, "loss/crossentropy": 2.3284069299697876, "loss/hidden": 3.890625, "loss/jsd": 0.0, "loss/logits": 0.3594726324081421, "step": 620 }, { "epoch": 0.038875, "grad_norm": 4.5625, "grad_norm_var": 0.15087483723958334, "learning_rate": 0.0001, "loss": 10.0477, "loss/crossentropy": 2.486843466758728, "loss/hidden": 3.9921875, "loss/jsd": 0.0, "loss/logits": 0.3659539967775345, "step": 622 }, { "epoch": 0.039, "grad_norm": 5.03125, "grad_norm_var": 0.15201416015625, "learning_rate": 0.0001, "loss": 10.1201, "loss/crossentropy": 2.581569790840149, "loss/hidden": 3.9453125, "loss/jsd": 0.0, "loss/logits": 0.3597100079059601, "step": 624 }, { "epoch": 0.039125, "grad_norm": 4.40625, "grad_norm_var": 0.13865559895833332, "learning_rate": 0.0001, "loss": 10.007, "loss/crossentropy": 2.284889340400696, "loss/hidden": 3.921875, "loss/jsd": 0.0, "loss/logits": 0.3608839064836502, "step": 626 }, { "epoch": 0.03925, "grad_norm": 4.71875, "grad_norm_var": 0.13800455729166666, "learning_rate": 0.0001, "loss": 10.2784, "loss/crossentropy": 2.603570342063904, "loss/hidden": 3.890625, "loss/jsd": 0.0, "loss/logits": 0.3870154470205307, "step": 628 }, { "epoch": 0.039375, "grad_norm": 4.15625, "grad_norm_var": 0.14263916015625, "learning_rate": 0.0001, "loss": 9.9924, "loss/crossentropy": 2.538639187812805, "loss/hidden": 4.0, "loss/jsd": 0.0, "loss/logits": 0.36513669788837433, "step": 630 }, { "epoch": 0.0395, "grad_norm": 4.3125, "grad_norm_var": 0.07198893229166667, "learning_rate": 0.0001, "loss": 9.8024, "loss/crossentropy": 2.3725547790527344, "loss/hidden": 3.953125, "loss/jsd": 0.0, "loss/logits": 0.3675404489040375, "step": 632 }, { "epoch": 0.039625, "grad_norm": 4.9375, "grad_norm_var": 0.09101155598958334, "learning_rate": 0.0001, "loss": 10.1525, "loss/crossentropy": 2.455158233642578, "loss/hidden": 3.953125, "loss/jsd": 0.0, "loss/logits": 0.39550477266311646, "step": 634 }, { "epoch": 0.03975, "grad_norm": 4.8125, "grad_norm_var": 0.08709309895833334, "learning_rate": 0.0001, "loss": 9.9672, "loss/crossentropy": 2.4096927642822266, "loss/hidden": 4.0, "loss/jsd": 0.0, "loss/logits": 0.37770088016986847, "step": 636 }, { "epoch": 0.039875, "grad_norm": 4.1875, "grad_norm_var": 0.09761962890625, "learning_rate": 0.0001, "loss": 10.0311, "loss/crossentropy": 2.3568464517593384, "loss/hidden": 3.859375, "loss/jsd": 0.0, "loss/logits": 0.3562964200973511, "step": 638 }, { "epoch": 0.04, "grad_norm": 4.34375, "grad_norm_var": 0.09308268229166666, "learning_rate": 0.0001, "loss": 9.9439, "loss/crossentropy": 2.312902569770813, "loss/hidden": 3.9296875, "loss/jsd": 0.0, "loss/logits": 0.38113781809806824, "step": 640 }, { "epoch": 0.040125, "grad_norm": 4.34375, "grad_norm_var": 0.09225260416666667, "learning_rate": 0.0001, "loss": 9.8767, "loss/crossentropy": 2.607424736022949, "loss/hidden": 3.9453125, "loss/jsd": 0.0, "loss/logits": 0.37369397282600403, "step": 642 }, { "epoch": 0.04025, "grad_norm": 6.84375, "grad_norm_var": 0.45631103515625, "learning_rate": 0.0001, "loss": 10.4975, "loss/crossentropy": 2.197988510131836, "loss/hidden": 3.8046875, "loss/jsd": 0.0, "loss/logits": 0.3591903746128082, "step": 644 }, { "epoch": 0.040375, "grad_norm": 4.9375, "grad_norm_var": 0.45435791015625, "learning_rate": 0.0001, "loss": 10.0203, "loss/crossentropy": 2.617353677749634, "loss/hidden": 3.96875, "loss/jsd": 0.0, "loss/logits": 0.38244011998176575, "step": 646 }, { "epoch": 0.0405, "grad_norm": 5.25, "grad_norm_var": 0.43922119140625, "learning_rate": 0.0001, "loss": 9.9007, "loss/crossentropy": 2.3076788187026978, "loss/hidden": 3.84375, "loss/jsd": 0.0, "loss/logits": 0.3668941855430603, "step": 648 }, { "epoch": 0.040625, "grad_norm": 4.59375, "grad_norm_var": 0.437744140625, "learning_rate": 0.0001, "loss": 10.1593, "loss/crossentropy": 2.3939337730407715, "loss/hidden": 3.8359375, "loss/jsd": 0.0, "loss/logits": 0.3384999781847, "step": 650 }, { "epoch": 0.04075, "grad_norm": 4.25, "grad_norm_var": 0.45924072265625, "learning_rate": 0.0001, "loss": 9.905, "loss/crossentropy": 2.3927820920944214, "loss/hidden": 3.8203125, "loss/jsd": 0.0, "loss/logits": 0.35453103482723236, "step": 652 }, { "epoch": 0.040875, "grad_norm": 4.25, "grad_norm_var": 0.4364217122395833, "learning_rate": 0.0001, "loss": 10.0505, "loss/crossentropy": 2.327817440032959, "loss/hidden": 3.953125, "loss/jsd": 0.0, "loss/logits": 0.35317686200141907, "step": 654 }, { "epoch": 0.041, "grad_norm": 4.28125, "grad_norm_var": 0.4278483072916667, "learning_rate": 0.0001, "loss": 10.0239, "loss/crossentropy": 2.6120766401290894, "loss/hidden": 3.8984375, "loss/jsd": 0.0, "loss/logits": 0.3679347038269043, "step": 656 }, { "epoch": 0.041125, "grad_norm": 5.15625, "grad_norm_var": 0.41848958333333336, "learning_rate": 0.0001, "loss": 10.1713, "loss/crossentropy": 2.545218348503113, "loss/hidden": 3.8671875, "loss/jsd": 0.0, "loss/logits": 0.37261858582496643, "step": 658 }, { "epoch": 0.04125, "grad_norm": 5.8125, "grad_norm_var": 0.5816243489583334, "learning_rate": 0.0001, "loss": 10.3016, "loss/crossentropy": 2.606196641921997, "loss/hidden": 3.984375, "loss/jsd": 0.0, "loss/logits": 0.39967936277389526, "step": 660 }, { "epoch": 0.041375, "grad_norm": 4.15625, "grad_norm_var": 0.5972615559895833, "learning_rate": 0.0001, "loss": 9.9053, "loss/crossentropy": 2.30068039894104, "loss/hidden": 3.84375, "loss/jsd": 0.0, "loss/logits": 0.3504374176263809, "step": 662 }, { "epoch": 0.0415, "grad_norm": 7.1875, "grad_norm_var": 0.9585774739583334, "learning_rate": 0.0001, "loss": 10.2778, "loss/crossentropy": 2.5713064670562744, "loss/hidden": 4.09375, "loss/jsd": 0.0, "loss/logits": 0.43116797506809235, "step": 664 }, { "epoch": 0.041625, "grad_norm": 5.6875, "grad_norm_var": 0.9890462239583333, "learning_rate": 0.0001, "loss": 10.1834, "loss/crossentropy": 2.442265272140503, "loss/hidden": 3.890625, "loss/jsd": 0.0, "loss/logits": 0.36182868480682373, "step": 666 }, { "epoch": 0.04175, "grad_norm": 4.375, "grad_norm_var": 0.9586222330729167, "learning_rate": 0.0001, "loss": 10.064, "loss/crossentropy": 2.4865576028823853, "loss/hidden": 3.953125, "loss/jsd": 0.0, "loss/logits": 0.37539851665496826, "step": 668 }, { "epoch": 0.041875, "grad_norm": 4.375, "grad_norm_var": 0.97222900390625, "learning_rate": 0.0001, "loss": 10.1095, "loss/crossentropy": 2.382893681526184, "loss/hidden": 3.890625, "loss/jsd": 0.0, "loss/logits": 0.3759250193834305, "step": 670 }, { "epoch": 0.042, "grad_norm": 4.46875, "grad_norm_var": 0.96640625, "learning_rate": 0.0001, "loss": 10.0076, "loss/crossentropy": 2.407153367996216, "loss/hidden": 3.890625, "loss/jsd": 0.0, "loss/logits": 0.3480299711227417, "step": 672 }, { "epoch": 0.042125, "grad_norm": 4.40625, "grad_norm_var": 1.0254557291666666, "learning_rate": 0.0001, "loss": 10.209, "loss/crossentropy": 2.239955425262451, "loss/hidden": 3.9140625, "loss/jsd": 0.0, "loss/logits": 0.38244734704494476, "step": 674 }, { "epoch": 0.04225, "grad_norm": 5.75, "grad_norm_var": 0.6648722330729167, "learning_rate": 0.0001, "loss": 10.3096, "loss/crossentropy": 2.3780544996261597, "loss/hidden": 4.125, "loss/jsd": 0.0, "loss/logits": 0.40292245149612427, "step": 676 }, { "epoch": 0.042375, "grad_norm": 4.375, "grad_norm_var": 0.6613240559895833, "learning_rate": 0.0001, "loss": 9.7608, "loss/crossentropy": 2.2625954151153564, "loss/hidden": 3.890625, "loss/jsd": 0.0, "loss/logits": 0.3428070992231369, "step": 678 }, { "epoch": 0.0425, "grad_norm": 5.03125, "grad_norm_var": 0.2589803059895833, "learning_rate": 0.0001, "loss": 9.8559, "loss/crossentropy": 2.404393434524536, "loss/hidden": 3.796875, "loss/jsd": 0.0, "loss/logits": 0.35663238167762756, "step": 680 }, { "epoch": 0.042625, "grad_norm": 4.625, "grad_norm_var": 0.146728515625, "learning_rate": 0.0001, "loss": 10.0169, "loss/crossentropy": 2.354637026786804, "loss/hidden": 3.9609375, "loss/jsd": 0.0, "loss/logits": 0.38327428698539734, "step": 682 }, { "epoch": 0.04275, "grad_norm": 5.5, "grad_norm_var": 0.20545247395833333, "learning_rate": 0.0001, "loss": 10.3256, "loss/crossentropy": 2.3648595809936523, "loss/hidden": 3.8671875, "loss/jsd": 0.0, "loss/logits": 0.3722570538520813, "step": 684 }, { "epoch": 0.042875, "grad_norm": 11.625, "grad_norm_var": 3.2235677083333334, "learning_rate": 0.0001, "loss": 10.0052, "loss/crossentropy": 2.255233407020569, "loss/hidden": 3.8125, "loss/jsd": 0.0, "loss/logits": 0.3729362040758133, "step": 686 }, { "epoch": 0.043, "grad_norm": 6.75, "grad_norm_var": 3.73131103515625, "learning_rate": 0.0001, "loss": 10.9058, "loss/crossentropy": 2.6284834146499634, "loss/hidden": 3.921875, "loss/jsd": 0.0, "loss/logits": 0.40316125750541687, "step": 688 }, { "epoch": 0.043125, "grad_norm": 4.84375, "grad_norm_var": 3.603706868489583, "learning_rate": 0.0001, "loss": 10.2674, "loss/crossentropy": 2.111253321170807, "loss/hidden": 3.828125, "loss/jsd": 0.0, "loss/logits": 0.33769945800304413, "step": 690 }, { "epoch": 0.04325, "grad_norm": 4.3125, "grad_norm_var": 3.69654541015625, "learning_rate": 0.0001, "loss": 9.9638, "loss/crossentropy": 2.461041808128357, "loss/hidden": 3.90625, "loss/jsd": 0.0, "loss/logits": 0.3787677586078644, "step": 692 }, { "epoch": 0.043375, "grad_norm": 5.0, "grad_norm_var": 3.5995442708333334, "learning_rate": 0.0001, "loss": 10.4949, "loss/crossentropy": 2.615231990814209, "loss/hidden": 4.015625, "loss/jsd": 0.0, "loss/logits": 0.4022253602743149, "step": 694 }, { "epoch": 0.0435, "grad_norm": 4.15625, "grad_norm_var": 3.6969889322916667, "learning_rate": 0.0001, "loss": 9.9086, "loss/crossentropy": 2.005247116088867, "loss/hidden": 3.8359375, "loss/jsd": 0.0, "loss/logits": 0.325066015124321, "step": 696 }, { "epoch": 0.043625, "grad_norm": 4.59375, "grad_norm_var": 3.723921712239583, "learning_rate": 0.0001, "loss": 9.6101, "loss/crossentropy": 2.4810917377471924, "loss/hidden": 3.8515625, "loss/jsd": 0.0, "loss/logits": 0.3531768321990967, "step": 698 }, { "epoch": 0.04375, "grad_norm": 5.34375, "grad_norm_var": 3.715104166666667, "learning_rate": 0.0001, "loss": 9.931, "loss/crossentropy": 2.4594435691833496, "loss/hidden": 3.90625, "loss/jsd": 0.0, "loss/logits": 0.3658078759908676, "step": 700 }, { "epoch": 0.043875, "grad_norm": 4.21875, "grad_norm_var": 1.0486979166666666, "learning_rate": 0.0001, "loss": 9.831, "loss/crossentropy": 2.4843145608901978, "loss/hidden": 3.8515625, "loss/jsd": 0.0, "loss/logits": 0.3550899028778076, "step": 702 }, { "epoch": 0.044, "grad_norm": 4.53125, "grad_norm_var": 0.1243560791015625, "learning_rate": 0.0001, "loss": 9.8716, "loss/crossentropy": 2.630019426345825, "loss/hidden": 3.9609375, "loss/jsd": 0.0, "loss/logits": 0.3656453341245651, "step": 704 }, { "epoch": 0.044125, "grad_norm": 4.71875, "grad_norm_var": 0.12111714680989584, "learning_rate": 0.0001, "loss": 10.0534, "loss/crossentropy": 2.42562735080719, "loss/hidden": 3.890625, "loss/jsd": 0.0, "loss/logits": 0.3582864999771118, "step": 706 }, { "epoch": 0.04425, "grad_norm": 4.21875, "grad_norm_var": 0.13979390462239583, "learning_rate": 0.0001, "loss": 9.9731, "loss/crossentropy": 2.4668819904327393, "loss/hidden": 3.9296875, "loss/jsd": 0.0, "loss/logits": 0.40261900424957275, "step": 708 }, { "epoch": 0.044375, "grad_norm": 4.34375, "grad_norm_var": 0.12498270670572917, "learning_rate": 0.0001, "loss": 9.74, "loss/crossentropy": 2.565619111061096, "loss/hidden": 3.8671875, "loss/jsd": 0.0, "loss/logits": 0.3622848391532898, "step": 710 }, { "epoch": 0.0445, "grad_norm": 4.8125, "grad_norm_var": 0.1250396728515625, "learning_rate": 0.0001, "loss": 9.886, "loss/crossentropy": 2.53997802734375, "loss/hidden": 3.78125, "loss/jsd": 0.0, "loss/logits": 0.36765336990356445, "step": 712 }, { "epoch": 0.044625, "grad_norm": 4.3125, "grad_norm_var": 0.13161519368489583, "learning_rate": 0.0001, "loss": 9.7684, "loss/crossentropy": 2.4978381395339966, "loss/hidden": 3.7890625, "loss/jsd": 0.0, "loss/logits": 0.33836664259433746, "step": 714 }, { "epoch": 0.04475, "grad_norm": 4.125, "grad_norm_var": 0.08456929524739583, "learning_rate": 0.0001, "loss": 9.78, "loss/crossentropy": 2.5785136222839355, "loss/hidden": 3.8046875, "loss/jsd": 0.0, "loss/logits": 0.36389264464378357, "step": 716 }, { "epoch": 0.044875, "grad_norm": 4.5, "grad_norm_var": 0.0640289306640625, "learning_rate": 0.0001, "loss": 9.7739, "loss/crossentropy": 2.398101568222046, "loss/hidden": 3.765625, "loss/jsd": 0.0, "loss/logits": 0.36110346019268036, "step": 718 }, { "epoch": 0.045, "grad_norm": 3.90625, "grad_norm_var": 0.07908426920572917, "learning_rate": 0.0001, "loss": 9.7225, "loss/crossentropy": 2.591996669769287, "loss/hidden": 3.7890625, "loss/jsd": 0.0, "loss/logits": 0.36347195506095886, "step": 720 }, { "epoch": 0.045125, "grad_norm": 4.28125, "grad_norm_var": 0.0589752197265625, "learning_rate": 0.0001, "loss": 9.7784, "loss/crossentropy": 2.527889609336853, "loss/hidden": 3.8203125, "loss/jsd": 0.0, "loss/logits": 0.36425699293613434, "step": 722 }, { "epoch": 0.04525, "grad_norm": 4.125, "grad_norm_var": 0.06559244791666667, "learning_rate": 0.0001, "loss": 9.2434, "loss/crossentropy": 2.3371061086654663, "loss/hidden": 3.765625, "loss/jsd": 0.0, "loss/logits": 0.3342244625091553, "step": 724 }, { "epoch": 0.045375, "grad_norm": 4.21875, "grad_norm_var": 0.06256103515625, "learning_rate": 0.0001, "loss": 9.7158, "loss/crossentropy": 2.479300379753113, "loss/hidden": 3.859375, "loss/jsd": 0.0, "loss/logits": 0.362941637635231, "step": 726 }, { "epoch": 0.0455, "grad_norm": 4.15625, "grad_norm_var": 0.04332275390625, "learning_rate": 0.0001, "loss": 9.7492, "loss/crossentropy": 2.529498338699341, "loss/hidden": 3.859375, "loss/jsd": 0.0, "loss/logits": 0.35392117500305176, "step": 728 }, { "epoch": 0.045625, "grad_norm": 5.21875, "grad_norm_var": 0.11004231770833334, "learning_rate": 0.0001, "loss": 10.1377, "loss/crossentropy": 2.6024062633514404, "loss/hidden": 3.90625, "loss/jsd": 0.0, "loss/logits": 0.375337615609169, "step": 730 }, { "epoch": 0.04575, "grad_norm": 4.15625, "grad_norm_var": 0.10641276041666667, "learning_rate": 0.0001, "loss": 9.4994, "loss/crossentropy": 2.29757559299469, "loss/hidden": 3.7578125, "loss/jsd": 0.0, "loss/logits": 0.32550813257694244, "step": 732 }, { "epoch": 0.045875, "grad_norm": 4.125, "grad_norm_var": 0.10484619140625, "learning_rate": 0.0001, "loss": 9.6179, "loss/crossentropy": 2.2457196712493896, "loss/hidden": 3.7578125, "loss/jsd": 0.0, "loss/logits": 0.3559332340955734, "step": 734 }, { "epoch": 0.046, "grad_norm": 4.9375, "grad_norm_var": 0.17092997233072918, "learning_rate": 0.0001, "loss": 9.7065, "loss/crossentropy": 2.2584201097488403, "loss/hidden": 3.765625, "loss/jsd": 0.0, "loss/logits": 0.3422286808490753, "step": 736 }, { "epoch": 0.046125, "grad_norm": 3.96875, "grad_norm_var": 0.18339742024739583, "learning_rate": 0.0001, "loss": 9.7707, "loss/crossentropy": 2.2206841707229614, "loss/hidden": 3.6953125, "loss/jsd": 0.0, "loss/logits": 0.3323328047990799, "step": 738 }, { "epoch": 0.04625, "grad_norm": 4.15625, "grad_norm_var": 0.18538004557291668, "learning_rate": 0.0001, "loss": 9.668, "loss/crossentropy": 2.2953317165374756, "loss/hidden": 3.7265625, "loss/jsd": 0.0, "loss/logits": 0.3379151374101639, "step": 740 }, { "epoch": 0.046375, "grad_norm": 4.4375, "grad_norm_var": 0.18704020182291667, "learning_rate": 0.0001, "loss": 9.8679, "loss/crossentropy": 2.5212793350219727, "loss/hidden": 3.7734375, "loss/jsd": 0.0, "loss/logits": 0.3425360172986984, "step": 742 }, { "epoch": 0.0465, "grad_norm": 4.5, "grad_norm_var": 0.18088785807291666, "learning_rate": 0.0001, "loss": 9.8863, "loss/crossentropy": 2.3973162174224854, "loss/hidden": 3.75, "loss/jsd": 0.0, "loss/logits": 0.3348172605037689, "step": 744 }, { "epoch": 0.046625, "grad_norm": 4.34375, "grad_norm_var": 0.14060872395833332, "learning_rate": 0.0001, "loss": 9.5977, "loss/crossentropy": 2.435719132423401, "loss/hidden": 3.765625, "loss/jsd": 0.0, "loss/logits": 0.3557274043560028, "step": 746 }, { "epoch": 0.04675, "grad_norm": 4.09375, "grad_norm_var": 0.14296468098958334, "learning_rate": 0.0001, "loss": 9.6008, "loss/crossentropy": 2.349796175956726, "loss/hidden": 3.71875, "loss/jsd": 0.0, "loss/logits": 0.3267946243286133, "step": 748 }, { "epoch": 0.046875, "grad_norm": 4.34375, "grad_norm_var": 0.13899332682291668, "learning_rate": 0.0001, "loss": 9.7616, "loss/crossentropy": 2.2053170204162598, "loss/hidden": 3.6875, "loss/jsd": 0.0, "loss/logits": 0.34219157695770264, "step": 750 }, { "epoch": 0.047, "grad_norm": 4.125, "grad_norm_var": 0.052718098958333334, "learning_rate": 0.0001, "loss": 9.7818, "loss/crossentropy": 2.4950714111328125, "loss/hidden": 3.8515625, "loss/jsd": 0.0, "loss/logits": 0.36529192328453064, "step": 752 }, { "epoch": 0.047125, "grad_norm": 4.125, "grad_norm_var": 0.03909098307291667, "learning_rate": 0.0001, "loss": 9.655, "loss/crossentropy": 2.20254647731781, "loss/hidden": 3.703125, "loss/jsd": 0.0, "loss/logits": 0.31340985000133514, "step": 754 }, { "epoch": 0.04725, "grad_norm": 4.40625, "grad_norm_var": 0.03970947265625, "learning_rate": 0.0001, "loss": 9.4693, "loss/crossentropy": 2.384778141975403, "loss/hidden": 3.78125, "loss/jsd": 0.0, "loss/logits": 0.33873017132282257, "step": 756 }, { "epoch": 0.047375, "grad_norm": 4.125, "grad_norm_var": 0.044709269205729166, "learning_rate": 0.0001, "loss": 9.6639, "loss/crossentropy": 2.4044910669326782, "loss/hidden": 3.734375, "loss/jsd": 0.0, "loss/logits": 0.3298284709453583, "step": 758 }, { "epoch": 0.0475, "grad_norm": 4.5625, "grad_norm_var": 0.05974019368489583, "learning_rate": 0.0001, "loss": 9.8048, "loss/crossentropy": 2.125362753868103, "loss/hidden": 3.7890625, "loss/jsd": 0.0, "loss/logits": 0.31744199991226196, "step": 760 }, { "epoch": 0.047625, "grad_norm": 4.25, "grad_norm_var": 0.05503641764322917, "learning_rate": 0.0001, "loss": 9.5929, "loss/crossentropy": 2.4581737518310547, "loss/hidden": 3.8203125, "loss/jsd": 0.0, "loss/logits": 0.3449174612760544, "step": 762 }, { "epoch": 0.04775, "grad_norm": 4.1875, "grad_norm_var": 0.05503641764322917, "learning_rate": 0.0001, "loss": 9.5925, "loss/crossentropy": 2.319092273712158, "loss/hidden": 3.6953125, "loss/jsd": 0.0, "loss/logits": 0.33179551362991333, "step": 764 }, { "epoch": 0.047875, "grad_norm": 4.25, "grad_norm_var": 0.0488677978515625, "learning_rate": 0.0001, "loss": 9.7524, "loss/crossentropy": 2.6482656002044678, "loss/hidden": 3.796875, "loss/jsd": 0.0, "loss/logits": 0.3577173054218292, "step": 766 }, { "epoch": 0.048, "grad_norm": 3.765625, "grad_norm_var": 0.06024983723958333, "learning_rate": 0.0001, "loss": 9.6826, "loss/crossentropy": 2.3165204524993896, "loss/hidden": 3.78125, "loss/jsd": 0.0, "loss/logits": 0.32748541235923767, "step": 768 }, { "epoch": 0.048125, "grad_norm": 4.375, "grad_norm_var": 0.058821614583333334, "learning_rate": 0.0001, "loss": 9.6115, "loss/crossentropy": 2.3324743509292603, "loss/hidden": 3.65625, "loss/jsd": 0.0, "loss/logits": 0.3622802048921585, "step": 770 }, { "epoch": 0.04825, "grad_norm": 4.0625, "grad_norm_var": 0.054911295572916664, "learning_rate": 0.0001, "loss": 9.7171, "loss/crossentropy": 2.293349862098694, "loss/hidden": 3.7734375, "loss/jsd": 0.0, "loss/logits": 0.341478168964386, "step": 772 }, { "epoch": 0.048375, "grad_norm": 3.8125, "grad_norm_var": 0.06122639973958333, "learning_rate": 0.0001, "loss": 9.5262, "loss/crossentropy": 2.6875650882720947, "loss/hidden": 3.78125, "loss/jsd": 0.0, "loss/logits": 0.35754619538784027, "step": 774 }, { "epoch": 0.0485, "grad_norm": 4.15625, "grad_norm_var": 0.03291015625, "learning_rate": 0.0001, "loss": 9.6137, "loss/crossentropy": 2.319961667060852, "loss/hidden": 3.703125, "loss/jsd": 0.0, "loss/logits": 0.3247549384832382, "step": 776 }, { "epoch": 0.048625, "grad_norm": 4.5, "grad_norm_var": 0.049117024739583334, "learning_rate": 0.0001, "loss": 9.7145, "loss/crossentropy": 2.2820927500724792, "loss/hidden": 3.7890625, "loss/jsd": 0.0, "loss/logits": 0.36693014204502106, "step": 778 }, { "epoch": 0.04875, "grad_norm": 4.4375, "grad_norm_var": 0.054520670572916666, "learning_rate": 0.0001, "loss": 9.6572, "loss/crossentropy": 2.323164224624634, "loss/hidden": 3.8046875, "loss/jsd": 0.0, "loss/logits": 0.3539418578147888, "step": 780 }, { "epoch": 0.048875, "grad_norm": 4.34375, "grad_norm_var": 0.05587565104166667, "learning_rate": 0.0001, "loss": 9.6451, "loss/crossentropy": 2.234144926071167, "loss/hidden": 3.6875, "loss/jsd": 0.0, "loss/logits": 0.3302062600851059, "step": 782 }, { "epoch": 0.049, "grad_norm": 4.15625, "grad_norm_var": 0.044530232747395836, "learning_rate": 0.0001, "loss": 9.6092, "loss/crossentropy": 2.0940393209457397, "loss/hidden": 3.671875, "loss/jsd": 0.0, "loss/logits": 0.33372722566127777, "step": 784 }, { "epoch": 0.049125, "grad_norm": 3.796875, "grad_norm_var": 0.06750895182291666, "learning_rate": 0.0001, "loss": 9.4303, "loss/crossentropy": 2.165642499923706, "loss/hidden": 3.671875, "loss/jsd": 0.0, "loss/logits": 0.3112798035144806, "step": 786 }, { "epoch": 0.04925, "grad_norm": 4.875, "grad_norm_var": 0.09644775390625, "learning_rate": 0.0001, "loss": 9.8704, "loss/crossentropy": 2.4933313131332397, "loss/hidden": 3.84375, "loss/jsd": 0.0, "loss/logits": 0.3489539921283722, "step": 788 }, { "epoch": 0.049375, "grad_norm": 3.578125, "grad_norm_var": 0.10728759765625, "learning_rate": 0.0001, "loss": 9.4931, "loss/crossentropy": 2.1803754568099976, "loss/hidden": 3.6953125, "loss/jsd": 0.0, "loss/logits": 0.3406267315149307, "step": 790 }, { "epoch": 0.0495, "grad_norm": 3.9375, "grad_norm_var": 0.11580301920572916, "learning_rate": 0.0001, "loss": 9.7055, "loss/crossentropy": 2.444434642791748, "loss/hidden": 3.734375, "loss/jsd": 0.0, "loss/logits": 0.34211790561676025, "step": 792 }, { "epoch": 0.049625, "grad_norm": 4.78125, "grad_norm_var": 0.12771708170572918, "learning_rate": 0.0001, "loss": 9.6484, "loss/crossentropy": 2.041724741458893, "loss/hidden": 3.7734375, "loss/jsd": 0.0, "loss/logits": 0.3167402148246765, "step": 794 }, { "epoch": 0.04975, "grad_norm": 4.0625, "grad_norm_var": 0.12382405598958333, "learning_rate": 0.0001, "loss": 9.616, "loss/crossentropy": 2.430485963821411, "loss/hidden": 3.71875, "loss/jsd": 0.0, "loss/logits": 0.359543040394783, "step": 796 }, { "epoch": 0.049875, "grad_norm": 3.9375, "grad_norm_var": 0.22916666666666666, "learning_rate": 0.0001, "loss": 9.8588, "loss/crossentropy": 2.421727180480957, "loss/hidden": 3.7734375, "loss/jsd": 0.0, "loss/logits": 0.3639049679040909, "step": 798 }, { "epoch": 0.05, "grad_norm": 3.9375, "grad_norm_var": 0.23111979166666666, "learning_rate": 0.0001, "loss": 9.5411, "loss/crossentropy": 2.173619508743286, "loss/hidden": 3.71875, "loss/jsd": 0.0, "loss/logits": 0.34150390326976776, "step": 800 }, { "epoch": 0.050125, "grad_norm": 4.0, "grad_norm_var": 0.2822825113932292, "learning_rate": 0.0001, "loss": 9.6478, "loss/crossentropy": 2.5516271591186523, "loss/hidden": 3.984375, "loss/jsd": 0.0, "loss/logits": 0.35007384419441223, "step": 802 }, { "epoch": 0.05025, "grad_norm": 4.4375, "grad_norm_var": 0.25654195149739584, "learning_rate": 0.0001, "loss": 9.6222, "loss/crossentropy": 2.5491143465042114, "loss/hidden": 3.7578125, "loss/jsd": 0.0, "loss/logits": 0.3419859707355499, "step": 804 }, { "epoch": 0.050375, "grad_norm": 4.4375, "grad_norm_var": 0.23424072265625, "learning_rate": 0.0001, "loss": 9.6289, "loss/crossentropy": 2.5568002462387085, "loss/hidden": 3.7578125, "loss/jsd": 0.0, "loss/logits": 0.38842974603176117, "step": 806 }, { "epoch": 0.0505, "grad_norm": 4.03125, "grad_norm_var": 0.22262369791666667, "learning_rate": 0.0001, "loss": 9.5115, "loss/crossentropy": 2.328479051589966, "loss/hidden": 3.7890625, "loss/jsd": 0.0, "loss/logits": 0.33506618440151215, "step": 808 }, { "epoch": 0.050625, "grad_norm": 3.96875, "grad_norm_var": 0.21578369140625, "learning_rate": 0.0001, "loss": 9.6195, "loss/crossentropy": 2.2139497995376587, "loss/hidden": 3.7421875, "loss/jsd": 0.0, "loss/logits": 0.3508079797029495, "step": 810 }, { "epoch": 0.05075, "grad_norm": 3.875, "grad_norm_var": 0.2201812744140625, "learning_rate": 0.0001, "loss": 9.3948, "loss/crossentropy": 2.3431121110916138, "loss/hidden": 3.75, "loss/jsd": 0.0, "loss/logits": 0.34683074057102203, "step": 812 }, { "epoch": 0.050875, "grad_norm": 4.4375, "grad_norm_var": 0.12883199055989583, "learning_rate": 0.0001, "loss": 9.9172, "loss/crossentropy": 2.356547713279724, "loss/hidden": 3.7890625, "loss/jsd": 0.0, "loss/logits": 0.3437325358390808, "step": 814 }, { "epoch": 0.051, "grad_norm": 4.21875, "grad_norm_var": 0.1194732666015625, "learning_rate": 0.0001, "loss": 9.5043, "loss/crossentropy": 2.438585638999939, "loss/hidden": 3.9140625, "loss/jsd": 0.0, "loss/logits": 0.34822140634059906, "step": 816 }, { "epoch": 0.051125, "grad_norm": 4.125, "grad_norm_var": 0.04539286295572917, "learning_rate": 0.0001, "loss": 9.398, "loss/crossentropy": 2.232826828956604, "loss/hidden": 3.703125, "loss/jsd": 0.0, "loss/logits": 0.33623330295085907, "step": 818 }, { "epoch": 0.05125, "grad_norm": 4.6875, "grad_norm_var": 0.05720113118489583, "learning_rate": 0.0001, "loss": 9.5585, "loss/crossentropy": 2.3401767015457153, "loss/hidden": 3.75, "loss/jsd": 0.0, "loss/logits": 0.33902715146541595, "step": 820 }, { "epoch": 0.051375, "grad_norm": 4.875, "grad_norm_var": 0.07072652180989583, "learning_rate": 0.0001, "loss": 9.4945, "loss/crossentropy": 2.1463793516159058, "loss/hidden": 3.84375, "loss/jsd": 0.0, "loss/logits": 0.3293427973985672, "step": 822 }, { "epoch": 0.0515, "grad_norm": 4.09375, "grad_norm_var": 0.06341145833333334, "learning_rate": 0.0001, "loss": 9.4966, "loss/crossentropy": 2.2964788675308228, "loss/hidden": 3.6953125, "loss/jsd": 0.0, "loss/logits": 0.3311201483011246, "step": 824 }, { "epoch": 0.051625, "grad_norm": 4.46875, "grad_norm_var": 0.06575520833333333, "learning_rate": 0.0001, "loss": 9.6328, "loss/crossentropy": 2.4484344720840454, "loss/hidden": 3.734375, "loss/jsd": 0.0, "loss/logits": 0.3489266186952591, "step": 826 }, { "epoch": 0.05175, "grad_norm": 4.03125, "grad_norm_var": 0.054541015625, "learning_rate": 0.0001, "loss": 9.4653, "loss/crossentropy": 2.2709579467773438, "loss/hidden": 3.75, "loss/jsd": 0.0, "loss/logits": 0.3134896159172058, "step": 828 }, { "epoch": 0.051875, "grad_norm": 3.8125, "grad_norm_var": 0.08080952962239583, "learning_rate": 0.0001, "loss": 9.5952, "loss/crossentropy": 2.3467541933059692, "loss/hidden": 3.703125, "loss/jsd": 0.0, "loss/logits": 0.33979402482509613, "step": 830 }, { "epoch": 0.052, "grad_norm": 4.375, "grad_norm_var": 0.09925028483072916, "learning_rate": 0.0001, "loss": 9.5125, "loss/crossentropy": 2.3023515939712524, "loss/hidden": 3.71875, "loss/jsd": 0.0, "loss/logits": 0.31126780807971954, "step": 832 }, { "epoch": 0.052125, "grad_norm": 4.0, "grad_norm_var": 0.10331929524739583, "learning_rate": 0.0001, "loss": 9.6691, "loss/crossentropy": 2.2762893438339233, "loss/hidden": 3.734375, "loss/jsd": 0.0, "loss/logits": 0.33598124980926514, "step": 834 }, { "epoch": 0.05225, "grad_norm": 4.625, "grad_norm_var": 0.10900065104166666, "learning_rate": 0.0001, "loss": 9.6829, "loss/crossentropy": 2.4334222078323364, "loss/hidden": 3.8203125, "loss/jsd": 0.0, "loss/logits": 0.3680105209350586, "step": 836 }, { "epoch": 0.052375, "grad_norm": 4.25, "grad_norm_var": 0.07909749348958334, "learning_rate": 0.0001, "loss": 9.585, "loss/crossentropy": 2.4400514364242554, "loss/hidden": 3.796875, "loss/jsd": 0.0, "loss/logits": 0.34038496017456055, "step": 838 }, { "epoch": 0.0525, "grad_norm": 3.921875, "grad_norm_var": 0.08203837076822916, "learning_rate": 0.0001, "loss": 9.4398, "loss/crossentropy": 2.413126230239868, "loss/hidden": 3.71875, "loss/jsd": 0.0, "loss/logits": 0.36697521805763245, "step": 840 }, { "epoch": 0.052625, "grad_norm": 4.3125, "grad_norm_var": 0.06610921223958334, "learning_rate": 0.0001, "loss": 9.9185, "loss/crossentropy": 2.547673225402832, "loss/hidden": 3.75, "loss/jsd": 0.0, "loss/logits": 0.36152128875255585, "step": 842 }, { "epoch": 0.05275, "grad_norm": 3.75, "grad_norm_var": 0.07534077962239584, "learning_rate": 0.0001, "loss": 9.3408, "loss/crossentropy": 2.469245195388794, "loss/hidden": 3.7265625, "loss/jsd": 0.0, "loss/logits": 0.34433713555336, "step": 844 }, { "epoch": 0.052875, "grad_norm": 4.53125, "grad_norm_var": 0.08121337890625, "learning_rate": 0.0001, "loss": 9.6554, "loss/crossentropy": 2.5020763874053955, "loss/hidden": 3.7421875, "loss/jsd": 0.0, "loss/logits": 0.3375013768672943, "step": 846 }, { "epoch": 0.053, "grad_norm": 3.6875, "grad_norm_var": 0.082080078125, "learning_rate": 0.0001, "loss": 9.4903, "loss/crossentropy": 2.294095277786255, "loss/hidden": 3.6640625, "loss/jsd": 0.0, "loss/logits": 0.32213783264160156, "step": 848 }, { "epoch": 0.053125, "grad_norm": 3.953125, "grad_norm_var": 0.07913411458333333, "learning_rate": 0.0001, "loss": 9.4497, "loss/crossentropy": 2.2911940813064575, "loss/hidden": 3.6796875, "loss/jsd": 0.0, "loss/logits": 0.31579577922821045, "step": 850 }, { "epoch": 0.05325, "grad_norm": 4.46875, "grad_norm_var": 0.06796468098958333, "learning_rate": 0.0001, "loss": 9.4926, "loss/crossentropy": 2.4447981119155884, "loss/hidden": 3.7578125, "loss/jsd": 0.0, "loss/logits": 0.33244381844997406, "step": 852 }, { "epoch": 0.053375, "grad_norm": 3.96875, "grad_norm_var": 0.06682942708333334, "learning_rate": 0.0001, "loss": 9.2842, "loss/crossentropy": 2.3211770057678223, "loss/hidden": 3.734375, "loss/jsd": 0.0, "loss/logits": 0.33892732858657837, "step": 854 }, { "epoch": 0.0535, "grad_norm": 3.96875, "grad_norm_var": 0.06518452962239583, "learning_rate": 0.0001, "loss": 9.7257, "loss/crossentropy": 2.362980604171753, "loss/hidden": 3.8671875, "loss/jsd": 0.0, "loss/logits": 0.3292359262704849, "step": 856 }, { "epoch": 0.053625, "grad_norm": 4.375, "grad_norm_var": 0.0662750244140625, "learning_rate": 0.0001, "loss": 9.2672, "loss/crossentropy": 2.178554058074951, "loss/hidden": 3.8671875, "loss/jsd": 0.0, "loss/logits": 0.3312181234359741, "step": 858 }, { "epoch": 0.05375, "grad_norm": 3.890625, "grad_norm_var": 0.06602274576822917, "learning_rate": 0.0001, "loss": 9.5052, "loss/crossentropy": 2.524282932281494, "loss/hidden": 3.7578125, "loss/jsd": 0.0, "loss/logits": 0.34150634706020355, "step": 860 }, { "epoch": 0.053875, "grad_norm": 3.921875, "grad_norm_var": 0.06665751139322916, "learning_rate": 0.0001, "loss": 9.4067, "loss/crossentropy": 2.345171332359314, "loss/hidden": 3.6875, "loss/jsd": 0.0, "loss/logits": 0.3292695879936218, "step": 862 }, { "epoch": 0.054, "grad_norm": 4.0625, "grad_norm_var": 0.05427144368489583, "learning_rate": 0.0001, "loss": 9.4267, "loss/crossentropy": 2.5034204721450806, "loss/hidden": 3.7421875, "loss/jsd": 0.0, "loss/logits": 0.34620046615600586, "step": 864 }, { "epoch": 0.054125, "grad_norm": 3.921875, "grad_norm_var": 0.05933837890625, "learning_rate": 0.0001, "loss": 9.6655, "loss/crossentropy": 2.609255313873291, "loss/hidden": 3.875, "loss/jsd": 0.0, "loss/logits": 0.38452892005443573, "step": 866 }, { "epoch": 0.05425, "grad_norm": 4.0625, "grad_norm_var": 0.054686482747395834, "learning_rate": 0.0001, "loss": 9.5649, "loss/crossentropy": 2.795304298400879, "loss/hidden": 3.75, "loss/jsd": 0.0, "loss/logits": 0.3581002801656723, "step": 868 }, { "epoch": 0.054375, "grad_norm": 3.859375, "grad_norm_var": 0.18343098958333334, "learning_rate": 0.0001, "loss": 9.3991, "loss/crossentropy": 2.3437917232513428, "loss/hidden": 3.796875, "loss/jsd": 0.0, "loss/logits": 0.3472553938627243, "step": 870 }, { "epoch": 0.0545, "grad_norm": 5.375, "grad_norm_var": 0.2687459309895833, "learning_rate": 0.0001, "loss": 9.6406, "loss/crossentropy": 2.4949249029159546, "loss/hidden": 3.7109375, "loss/jsd": 0.0, "loss/logits": 0.3250335156917572, "step": 872 }, { "epoch": 0.054625, "grad_norm": 3.9375, "grad_norm_var": 0.2698313395182292, "learning_rate": 0.0001, "loss": 9.4352, "loss/crossentropy": 2.646947979927063, "loss/hidden": 3.7265625, "loss/jsd": 0.0, "loss/logits": 0.3375275284051895, "step": 874 }, { "epoch": 0.05475, "grad_norm": 4.4375, "grad_norm_var": 0.26324462890625, "learning_rate": 0.0001, "loss": 9.6272, "loss/crossentropy": 2.342916250228882, "loss/hidden": 3.6875, "loss/jsd": 0.0, "loss/logits": 0.335741251707077, "step": 876 }, { "epoch": 0.054875, "grad_norm": 4.25, "grad_norm_var": 0.226904296875, "learning_rate": 0.0001, "loss": 9.521, "loss/crossentropy": 2.309291124343872, "loss/hidden": 3.75, "loss/jsd": 0.0, "loss/logits": 0.34289687871932983, "step": 878 }, { "epoch": 0.055, "grad_norm": 4.0625, "grad_norm_var": 0.22008056640625, "learning_rate": 0.0001, "loss": 9.6429, "loss/crossentropy": 2.363176941871643, "loss/hidden": 3.6875, "loss/jsd": 0.0, "loss/logits": 0.31846094131469727, "step": 880 }, { "epoch": 0.055125, "grad_norm": 4.28125, "grad_norm_var": 0.21479390462239584, "learning_rate": 0.0001, "loss": 9.4278, "loss/crossentropy": 2.295005202293396, "loss/hidden": 3.703125, "loss/jsd": 0.0, "loss/logits": 0.35759809613227844, "step": 882 }, { "epoch": 0.05525, "grad_norm": 4.09375, "grad_norm_var": 0.22467041015625, "learning_rate": 0.0001, "loss": 9.3845, "loss/crossentropy": 2.2038002014160156, "loss/hidden": 3.6875, "loss/jsd": 0.0, "loss/logits": 0.3040274381637573, "step": 884 }, { "epoch": 0.055375, "grad_norm": 4.375, "grad_norm_var": 0.11887919108072917, "learning_rate": 0.0001, "loss": 9.6735, "loss/crossentropy": 2.581911325454712, "loss/hidden": 3.734375, "loss/jsd": 0.0, "loss/logits": 0.3485991954803467, "step": 886 }, { "epoch": 0.0555, "grad_norm": 3.875, "grad_norm_var": 0.04658915201822917, "learning_rate": 0.0001, "loss": 9.399, "loss/crossentropy": 2.504314661026001, "loss/hidden": 3.75, "loss/jsd": 0.0, "loss/logits": 0.3332435041666031, "step": 888 }, { "epoch": 0.055625, "grad_norm": 3.859375, "grad_norm_var": 0.05237630208333333, "learning_rate": 0.0001, "loss": 9.6586, "loss/crossentropy": 2.4632703065872192, "loss/hidden": 3.7734375, "loss/jsd": 0.0, "loss/logits": 0.3592112809419632, "step": 890 }, { "epoch": 0.05575, "grad_norm": 4.4375, "grad_norm_var": 0.05908203125, "learning_rate": 0.0001, "loss": 9.5926, "loss/crossentropy": 2.600472331047058, "loss/hidden": 3.7109375, "loss/jsd": 0.0, "loss/logits": 0.3421791195869446, "step": 892 }, { "epoch": 0.055875, "grad_norm": 3.78125, "grad_norm_var": 0.06067301432291667, "learning_rate": 0.0001, "loss": 9.3855, "loss/crossentropy": 2.1859222650527954, "loss/hidden": 3.6796875, "loss/jsd": 0.0, "loss/logits": 0.29888148605823517, "step": 894 }, { "epoch": 0.056, "grad_norm": 3.9375, "grad_norm_var": 0.05611572265625, "learning_rate": 0.0001, "loss": 9.4589, "loss/crossentropy": 2.3843421936035156, "loss/hidden": 3.6953125, "loss/jsd": 0.0, "loss/logits": 0.3476228564977646, "step": 896 }, { "epoch": 0.056125, "grad_norm": 4.0, "grad_norm_var": 0.056477864583333336, "learning_rate": 0.0001, "loss": 9.3348, "loss/crossentropy": 2.3973230123519897, "loss/hidden": 3.671875, "loss/jsd": 0.0, "loss/logits": 0.3175063878297806, "step": 898 }, { "epoch": 0.05625, "grad_norm": 4.0, "grad_norm_var": 0.06223856608072917, "learning_rate": 0.0001, "loss": 9.5278, "loss/crossentropy": 2.350813627243042, "loss/hidden": 3.734375, "loss/jsd": 0.0, "loss/logits": 0.3350457102060318, "step": 900 }, { "epoch": 0.056375, "grad_norm": 4.5, "grad_norm_var": 0.0687652587890625, "learning_rate": 0.0001, "loss": 9.6984, "loss/crossentropy": 2.3941330909729004, "loss/hidden": 3.765625, "loss/jsd": 0.0, "loss/logits": 0.33253586292266846, "step": 902 }, { "epoch": 0.0565, "grad_norm": 3.984375, "grad_norm_var": 0.09226888020833333, "learning_rate": 0.0001, "loss": 9.686, "loss/crossentropy": 2.4759925603866577, "loss/hidden": 3.71875, "loss/jsd": 0.0, "loss/logits": 0.3297637850046158, "step": 904 }, { "epoch": 0.056625, "grad_norm": 4.34375, "grad_norm_var": 0.08549702962239583, "learning_rate": 0.0001, "loss": 9.5501, "loss/crossentropy": 2.643571376800537, "loss/hidden": 3.7421875, "loss/jsd": 0.0, "loss/logits": 0.3501765877008438, "step": 906 }, { "epoch": 0.05675, "grad_norm": 4.0, "grad_norm_var": 0.06550191243489584, "learning_rate": 0.0001, "loss": 9.5891, "loss/crossentropy": 2.432242512702942, "loss/hidden": 3.6953125, "loss/jsd": 0.0, "loss/logits": 0.3715456426143646, "step": 908 }, { "epoch": 0.056875, "grad_norm": 4.03125, "grad_norm_var": 0.059403483072916666, "learning_rate": 0.0001, "loss": 9.4375, "loss/crossentropy": 2.525635004043579, "loss/hidden": 3.7421875, "loss/jsd": 0.0, "loss/logits": 0.35497090220451355, "step": 910 }, { "epoch": 0.057, "grad_norm": 4.34375, "grad_norm_var": 0.0610504150390625, "learning_rate": 0.0001, "loss": 9.663, "loss/crossentropy": 2.6299631595611572, "loss/hidden": 3.8125, "loss/jsd": 0.0, "loss/logits": 0.34647491574287415, "step": 912 }, { "epoch": 0.057125, "grad_norm": 4.3125, "grad_norm_var": 0.1096343994140625, "learning_rate": 0.0001, "loss": 9.2623, "loss/crossentropy": 2.2141228914260864, "loss/hidden": 3.6484375, "loss/jsd": 0.0, "loss/logits": 0.333205983042717, "step": 914 }, { "epoch": 0.05725, "grad_norm": 3.796875, "grad_norm_var": 0.1134765625, "learning_rate": 0.0001, "loss": 9.5108, "loss/crossentropy": 2.331273913383484, "loss/hidden": 3.671875, "loss/jsd": 0.0, "loss/logits": 0.32548947632312775, "step": 916 }, { "epoch": 0.057375, "grad_norm": 3.765625, "grad_norm_var": 0.1252838134765625, "learning_rate": 0.0001, "loss": 9.3267, "loss/crossentropy": 2.20035183429718, "loss/hidden": 3.6953125, "loss/jsd": 0.0, "loss/logits": 0.31392902135849, "step": 918 }, { "epoch": 0.0575, "grad_norm": 4.53125, "grad_norm_var": 0.11096598307291666, "learning_rate": 0.0001, "loss": 9.4211, "loss/crossentropy": 2.4890637397766113, "loss/hidden": 3.703125, "loss/jsd": 0.0, "loss/logits": 0.3301192671060562, "step": 920 }, { "epoch": 0.057625, "grad_norm": 4.65625, "grad_norm_var": 0.1294921875, "learning_rate": 0.0001, "loss": 9.5324, "loss/crossentropy": 2.5583585500717163, "loss/hidden": 3.703125, "loss/jsd": 0.0, "loss/logits": 0.3586796224117279, "step": 922 }, { "epoch": 0.05775, "grad_norm": 4.03125, "grad_norm_var": 0.13170572916666667, "learning_rate": 0.0001, "loss": 9.7246, "loss/crossentropy": 2.417726516723633, "loss/hidden": 3.6953125, "loss/jsd": 0.0, "loss/logits": 0.3519165962934494, "step": 924 }, { "epoch": 0.057875, "grad_norm": 4.1875, "grad_norm_var": 0.12793680826822917, "learning_rate": 0.0001, "loss": 9.3623, "loss/crossentropy": 2.4320497512817383, "loss/hidden": 3.6875, "loss/jsd": 0.0, "loss/logits": 0.35020147264003754, "step": 926 }, { "epoch": 0.058, "grad_norm": 4.4375, "grad_norm_var": 0.14709879557291666, "learning_rate": 0.0001, "loss": 9.2962, "loss/crossentropy": 2.2121816873550415, "loss/hidden": 3.71875, "loss/jsd": 0.0, "loss/logits": 0.32885268330574036, "step": 928 }, { "epoch": 0.058125, "grad_norm": 3.984375, "grad_norm_var": 0.08567606608072917, "learning_rate": 0.0001, "loss": 9.3099, "loss/crossentropy": 2.382021903991699, "loss/hidden": 3.7890625, "loss/jsd": 0.0, "loss/logits": 0.32922065258026123, "step": 930 }, { "epoch": 0.05825, "grad_norm": 3.890625, "grad_norm_var": 0.08528645833333333, "learning_rate": 0.0001, "loss": 9.4542, "loss/crossentropy": 2.2859452962875366, "loss/hidden": 3.671875, "loss/jsd": 0.0, "loss/logits": 0.3154686838388443, "step": 932 }, { "epoch": 0.058375, "grad_norm": 4.03125, "grad_norm_var": 0.07997945149739584, "learning_rate": 0.0001, "loss": 9.3008, "loss/crossentropy": 2.3658918142318726, "loss/hidden": 3.6875, "loss/jsd": 0.0, "loss/logits": 0.3251184970140457, "step": 934 }, { "epoch": 0.0585, "grad_norm": 3.75, "grad_norm_var": 0.07141825358072916, "learning_rate": 0.0001, "loss": 9.3537, "loss/crossentropy": 2.199766993522644, "loss/hidden": 3.6484375, "loss/jsd": 0.0, "loss/logits": 0.3467453122138977, "step": 936 }, { "epoch": 0.058625, "grad_norm": 3.828125, "grad_norm_var": 0.0466796875, "learning_rate": 0.0001, "loss": 9.6814, "loss/crossentropy": 2.6493847370147705, "loss/hidden": 3.703125, "loss/jsd": 0.0, "loss/logits": 0.3443215787410736, "step": 938 }, { "epoch": 0.05875, "grad_norm": 4.0, "grad_norm_var": 0.04973551432291667, "learning_rate": 0.0001, "loss": 9.3843, "loss/crossentropy": 2.501352071762085, "loss/hidden": 3.6640625, "loss/jsd": 0.0, "loss/logits": 0.3372286558151245, "step": 940 }, { "epoch": 0.058875, "grad_norm": 3.640625, "grad_norm_var": 0.06533203125, "learning_rate": 0.0001, "loss": 9.1373, "loss/crossentropy": 2.271903336048126, "loss/hidden": 3.578125, "loss/jsd": 0.0, "loss/logits": 0.3119770586490631, "step": 942 }, { "epoch": 0.059, "grad_norm": 3.9375, "grad_norm_var": 0.06785481770833333, "learning_rate": 0.0001, "loss": 9.5648, "loss/crossentropy": 2.495382308959961, "loss/hidden": 3.6953125, "loss/jsd": 0.0, "loss/logits": 0.35206887125968933, "step": 944 }, { "epoch": 0.059125, "grad_norm": 3.875, "grad_norm_var": 0.07141825358072916, "learning_rate": 0.0001, "loss": 9.4638, "loss/crossentropy": 2.582332134246826, "loss/hidden": 3.640625, "loss/jsd": 0.0, "loss/logits": 0.3388357609510422, "step": 946 }, { "epoch": 0.05925, "grad_norm": 4.0625, "grad_norm_var": 0.069580078125, "learning_rate": 0.0001, "loss": 9.3972, "loss/crossentropy": 2.7968443632125854, "loss/hidden": 3.7734375, "loss/jsd": 0.0, "loss/logits": 0.34314611554145813, "step": 948 }, { "epoch": 0.059375, "grad_norm": 3.953125, "grad_norm_var": 0.0709136962890625, "learning_rate": 0.0001, "loss": 9.5731, "loss/crossentropy": 2.361037015914917, "loss/hidden": 3.65625, "loss/jsd": 0.0, "loss/logits": 0.31498147547245026, "step": 950 }, { "epoch": 0.0595, "grad_norm": 3.75, "grad_norm_var": 0.07116597493489583, "learning_rate": 0.0001, "loss": 9.4971, "loss/crossentropy": 2.493962287902832, "loss/hidden": 3.7265625, "loss/jsd": 0.0, "loss/logits": 0.3326618820428848, "step": 952 }, { "epoch": 0.059625, "grad_norm": 3.734375, "grad_norm_var": 0.07270406087239584, "learning_rate": 0.0001, "loss": 9.5408, "loss/crossentropy": 2.4259244203567505, "loss/hidden": 3.6328125, "loss/jsd": 0.0, "loss/logits": 0.3625700771808624, "step": 954 }, { "epoch": 0.05975, "grad_norm": 4.9375, "grad_norm_var": 0.12129618326822916, "learning_rate": 0.0001, "loss": 9.4081, "loss/crossentropy": 2.146342396736145, "loss/hidden": 3.6484375, "loss/jsd": 0.0, "loss/logits": 0.3270118683576584, "step": 956 }, { "epoch": 0.059875, "grad_norm": 3.65625, "grad_norm_var": 0.1236968994140625, "learning_rate": 0.0001, "loss": 9.3212, "loss/crossentropy": 2.3437579870224, "loss/hidden": 3.765625, "loss/jsd": 0.0, "loss/logits": 0.34136760234832764, "step": 958 }, { "epoch": 0.06, "grad_norm": 3.921875, "grad_norm_var": 0.11343485514322917, "learning_rate": 0.0001, "loss": 9.4308, "loss/crossentropy": 2.330640196800232, "loss/hidden": 3.6328125, "loss/jsd": 0.0, "loss/logits": 0.3233296573162079, "step": 960 }, { "epoch": 0.060125, "grad_norm": 3.765625, "grad_norm_var": 0.11761067708333334, "learning_rate": 0.0001, "loss": 9.1553, "loss/crossentropy": 2.392879366874695, "loss/hidden": 3.625, "loss/jsd": 0.0, "loss/logits": 0.33645670115947723, "step": 962 }, { "epoch": 0.06025, "grad_norm": 3.484375, "grad_norm_var": 0.13583984375, "learning_rate": 0.0001, "loss": 9.2888, "loss/crossentropy": 2.2503827810287476, "loss/hidden": 3.65625, "loss/jsd": 0.0, "loss/logits": 0.3347347527742386, "step": 964 }, { "epoch": 0.060375, "grad_norm": 3.984375, "grad_norm_var": 0.13542378743489583, "learning_rate": 0.0001, "loss": 9.368, "loss/crossentropy": 2.4098986387252808, "loss/hidden": 3.6484375, "loss/jsd": 0.0, "loss/logits": 0.3416828215122223, "step": 966 }, { "epoch": 0.0605, "grad_norm": 4.3125, "grad_norm_var": 0.13982645670572916, "learning_rate": 0.0001, "loss": 9.4573, "loss/crossentropy": 2.1524184942245483, "loss/hidden": 3.6171875, "loss/jsd": 0.0, "loss/logits": 0.30603043735027313, "step": 968 }, { "epoch": 0.060625, "grad_norm": 3.65625, "grad_norm_var": 0.14397786458333334, "learning_rate": 0.0001, "loss": 9.2709, "loss/crossentropy": 2.2244198322296143, "loss/hidden": 3.6171875, "loss/jsd": 0.0, "loss/logits": 0.317968025803566, "step": 970 }, { "epoch": 0.06075, "grad_norm": 3.5, "grad_norm_var": 0.08929036458333334, "learning_rate": 0.0001, "loss": 9.3768, "loss/crossentropy": 2.4069145917892456, "loss/hidden": 3.6328125, "loss/jsd": 0.0, "loss/logits": 0.3366214781999588, "step": 972 }, { "epoch": 0.060875, "grad_norm": 3.765625, "grad_norm_var": 0.05461324055989583, "learning_rate": 0.0001, "loss": 9.0551, "loss/crossentropy": 2.486254572868347, "loss/hidden": 3.6484375, "loss/jsd": 0.0, "loss/logits": 0.2998432517051697, "step": 974 }, { "epoch": 0.061, "grad_norm": 4.59375, "grad_norm_var": 0.09365132649739584, "learning_rate": 0.0001, "loss": 9.3255, "loss/crossentropy": 2.443332552909851, "loss/hidden": 3.734375, "loss/jsd": 0.0, "loss/logits": 0.3492431044578552, "step": 976 }, { "epoch": 0.061125, "grad_norm": 4.0, "grad_norm_var": 0.09325764973958334, "learning_rate": 0.0001, "loss": 9.3259, "loss/crossentropy": 2.4328815937042236, "loss/hidden": 3.671875, "loss/jsd": 0.0, "loss/logits": 0.3370583653450012, "step": 978 }, { "epoch": 0.06125, "grad_norm": 3.859375, "grad_norm_var": 0.09388020833333334, "learning_rate": 0.0001, "loss": 9.1005, "loss/crossentropy": 2.373159170150757, "loss/hidden": 3.640625, "loss/jsd": 0.0, "loss/logits": 0.31204234063625336, "step": 980 }, { "epoch": 0.061375, "grad_norm": 3.828125, "grad_norm_var": 0.09207255045572917, "learning_rate": 0.0001, "loss": 9.2474, "loss/crossentropy": 2.319578170776367, "loss/hidden": 3.640625, "loss/jsd": 0.0, "loss/logits": 0.3269862085580826, "step": 982 }, { "epoch": 0.0615, "grad_norm": 4.0625, "grad_norm_var": 0.08448893229166667, "learning_rate": 0.0001, "loss": 9.1644, "loss/crossentropy": 2.042439818382263, "loss/hidden": 3.671875, "loss/jsd": 0.0, "loss/logits": 0.2966649830341339, "step": 984 }, { "epoch": 0.061625, "grad_norm": 3.890625, "grad_norm_var": 0.07712300618489583, "learning_rate": 0.0001, "loss": 9.354, "loss/crossentropy": 2.3522753715515137, "loss/hidden": 3.65625, "loss/jsd": 0.0, "loss/logits": 0.3372476100921631, "step": 986 }, { "epoch": 0.06175, "grad_norm": 3.890625, "grad_norm_var": 0.06454671223958333, "learning_rate": 0.0001, "loss": 9.4464, "loss/crossentropy": 2.3970394134521484, "loss/hidden": 3.625, "loss/jsd": 0.0, "loss/logits": 0.3193608969449997, "step": 988 }, { "epoch": 0.061875, "grad_norm": 4.0, "grad_norm_var": 0.04579976399739583, "learning_rate": 0.0001, "loss": 9.5096, "loss/crossentropy": 2.4436655044555664, "loss/hidden": 3.6640625, "loss/jsd": 0.0, "loss/logits": 0.3359542489051819, "step": 990 }, { "epoch": 0.062, "grad_norm": 4.0625, "grad_norm_var": 0.020531209309895833, "learning_rate": 0.0001, "loss": 9.6125, "loss/crossentropy": 2.5487223863601685, "loss/hidden": 3.6875, "loss/jsd": 0.0, "loss/logits": 0.3535960167646408, "step": 992 }, { "epoch": 0.062125, "grad_norm": 3.78125, "grad_norm_var": 0.023322550455729167, "learning_rate": 0.0001, "loss": 9.3969, "loss/crossentropy": 2.2758008241653442, "loss/hidden": 3.65625, "loss/jsd": 0.0, "loss/logits": 0.3227398693561554, "step": 994 }, { "epoch": 0.06225, "grad_norm": 4.0625, "grad_norm_var": 0.01099853515625, "learning_rate": 0.0001, "loss": 9.2286, "loss/crossentropy": 2.065541088581085, "loss/hidden": 3.5859375, "loss/jsd": 0.0, "loss/logits": 0.2921288013458252, "step": 996 }, { "epoch": 0.062375, "grad_norm": 4.15625, "grad_norm_var": 0.015135701497395833, "learning_rate": 0.0001, "loss": 9.565, "loss/crossentropy": 2.5905497074127197, "loss/hidden": 3.6875, "loss/jsd": 0.0, "loss/logits": 0.3195539563894272, "step": 998 }, { "epoch": 0.0625, "grad_norm": 4.09375, "grad_norm_var": 0.013719685872395833, "learning_rate": 0.0001, "loss": 9.4557, "loss/crossentropy": 2.491615653038025, "loss/hidden": 3.75, "loss/jsd": 0.0, "loss/logits": 0.348484605550766, "step": 1000 }, { "epoch": 0.062625, "grad_norm": 3.828125, "grad_norm_var": 0.018309529622395834, "learning_rate": 0.0001, "loss": 9.4021, "loss/crossentropy": 2.271575689315796, "loss/hidden": 3.65625, "loss/jsd": 0.0, "loss/logits": 0.3233788311481476, "step": 1002 }, { "epoch": 0.06275, "grad_norm": 3.984375, "grad_norm_var": 0.017585245768229167, "learning_rate": 0.0001, "loss": 9.2456, "loss/crossentropy": 2.1161099076271057, "loss/hidden": 3.6640625, "loss/jsd": 0.0, "loss/logits": 0.3034187853336334, "step": 1004 }, { "epoch": 0.062875, "grad_norm": 3.609375, "grad_norm_var": 0.025520833333333333, "learning_rate": 0.0001, "loss": 9.3095, "loss/crossentropy": 2.369056463241577, "loss/hidden": 3.796875, "loss/jsd": 0.0, "loss/logits": 0.3199651837348938, "step": 1006 }, { "epoch": 0.063, "grad_norm": 4.1875, "grad_norm_var": 0.029157511393229165, "learning_rate": 0.0001, "loss": 9.267, "loss/crossentropy": 2.327115058898926, "loss/hidden": 3.6328125, "loss/jsd": 0.0, "loss/logits": 0.34022311866283417, "step": 1008 }, { "epoch": 0.063125, "grad_norm": 3.828125, "grad_norm_var": 0.030631510416666667, "learning_rate": 0.0001, "loss": 9.3698, "loss/crossentropy": 2.7188167572021484, "loss/hidden": 3.6484375, "loss/jsd": 0.0, "loss/logits": 0.33434274792671204, "step": 1010 }, { "epoch": 0.06325, "grad_norm": 4.21875, "grad_norm_var": 0.036295572916666664, "learning_rate": 0.0001, "loss": 9.5306, "loss/crossentropy": 2.2543774843215942, "loss/hidden": 3.640625, "loss/jsd": 0.0, "loss/logits": 0.3080340623855591, "step": 1012 }, { "epoch": 0.063375, "grad_norm": 3.9375, "grad_norm_var": 0.03961181640625, "learning_rate": 0.0001, "loss": 9.2311, "loss/crossentropy": 2.137963056564331, "loss/hidden": 3.6484375, "loss/jsd": 0.0, "loss/logits": 0.3146566152572632, "step": 1014 }, { "epoch": 0.0635, "grad_norm": 3.875, "grad_norm_var": 0.03809305826822917, "learning_rate": 0.0001, "loss": 9.3595, "loss/crossentropy": 2.378168821334839, "loss/hidden": 3.671875, "loss/jsd": 0.0, "loss/logits": 0.3253423571586609, "step": 1016 }, { "epoch": 0.063625, "grad_norm": 3.90625, "grad_norm_var": 0.0347808837890625, "learning_rate": 0.0001, "loss": 9.1484, "loss/crossentropy": 2.2595038414001465, "loss/hidden": 3.546875, "loss/jsd": 0.0, "loss/logits": 0.3158324211835861, "step": 1018 }, { "epoch": 0.06375, "grad_norm": 3.9375, "grad_norm_var": 0.03615620930989583, "learning_rate": 0.0001, "loss": 9.2288, "loss/crossentropy": 2.4348455667495728, "loss/hidden": 3.6796875, "loss/jsd": 0.0, "loss/logits": 0.3402617871761322, "step": 1020 }, { "epoch": 0.063875, "grad_norm": 4.21875, "grad_norm_var": 0.040299479166666666, "learning_rate": 0.0001, "loss": 9.4738, "loss/crossentropy": 2.4196285009384155, "loss/hidden": 3.6796875, "loss/jsd": 0.0, "loss/logits": 0.3641415983438492, "step": 1022 }, { "epoch": 0.064, "grad_norm": 4.0, "grad_norm_var": 0.03557535807291667, "learning_rate": 0.0001, "loss": 9.3856, "loss/crossentropy": 2.7573235034942627, "loss/hidden": 3.671875, "loss/jsd": 0.0, "loss/logits": 0.3504706919193268, "step": 1024 }, { "epoch": 0.064125, "grad_norm": 3.921875, "grad_norm_var": 0.03811442057291667, "learning_rate": 0.0001, "loss": 9.4086, "loss/crossentropy": 2.39897620677948, "loss/hidden": 3.671875, "loss/jsd": 0.0, "loss/logits": 0.3188930004835129, "step": 1026 }, { "epoch": 0.06425, "grad_norm": 3.765625, "grad_norm_var": 0.044417317708333334, "learning_rate": 0.0001, "loss": 8.9954, "loss/crossentropy": 2.37536883354187, "loss/hidden": 3.65625, "loss/jsd": 0.0, "loss/logits": 0.30067528784275055, "step": 1028 }, { "epoch": 0.064375, "grad_norm": 3.859375, "grad_norm_var": 0.05654195149739583, "learning_rate": 0.0001, "loss": 9.5045, "loss/crossentropy": 2.5579299926757812, "loss/hidden": 3.6640625, "loss/jsd": 0.0, "loss/logits": 0.34567518532276154, "step": 1030 }, { "epoch": 0.0645, "grad_norm": 3.84375, "grad_norm_var": 0.0560699462890625, "learning_rate": 0.0001, "loss": 9.497, "loss/crossentropy": 2.4573644399642944, "loss/hidden": 3.640625, "loss/jsd": 0.0, "loss/logits": 0.3271123617887497, "step": 1032 }, { "epoch": 0.064625, "grad_norm": 3.734375, "grad_norm_var": 0.13196512858072917, "learning_rate": 0.0001, "loss": 9.2878, "loss/crossentropy": 2.369649648666382, "loss/hidden": 3.78125, "loss/jsd": 0.0, "loss/logits": 0.3534861207008362, "step": 1034 }, { "epoch": 0.06475, "grad_norm": 4.34375, "grad_norm_var": 0.1352447509765625, "learning_rate": 0.0001, "loss": 9.2811, "loss/crossentropy": 2.366884231567383, "loss/hidden": 3.5859375, "loss/jsd": 0.0, "loss/logits": 0.3179771304130554, "step": 1036 }, { "epoch": 0.064875, "grad_norm": 4.28125, "grad_norm_var": 0.13723042805989583, "learning_rate": 0.0001, "loss": 9.245, "loss/crossentropy": 2.385068416595459, "loss/hidden": 3.6484375, "loss/jsd": 0.0, "loss/logits": 0.33948729932308197, "step": 1038 }, { "epoch": 0.065, "grad_norm": 3.953125, "grad_norm_var": 0.13835347493489583, "learning_rate": 0.0001, "loss": 9.3012, "loss/crossentropy": 2.4422179460525513, "loss/hidden": 3.671875, "loss/jsd": 0.0, "loss/logits": 0.3261062651872635, "step": 1040 }, { "epoch": 0.065125, "grad_norm": 3.890625, "grad_norm_var": 0.1297760009765625, "learning_rate": 0.0001, "loss": 9.2994, "loss/crossentropy": 2.393699288368225, "loss/hidden": 3.65625, "loss/jsd": 0.0, "loss/logits": 0.31233613193035126, "step": 1042 }, { "epoch": 0.06525, "grad_norm": 3.90625, "grad_norm_var": 0.10129292805989583, "learning_rate": 0.0001, "loss": 9.4722, "loss/crossentropy": 2.4751322269439697, "loss/hidden": 3.625, "loss/jsd": 0.0, "loss/logits": 0.34922046959400177, "step": 1044 }, { "epoch": 0.065375, "grad_norm": 4.09375, "grad_norm_var": 0.10408528645833333, "learning_rate": 0.0001, "loss": 9.2301, "loss/crossentropy": 2.3055585622787476, "loss/hidden": 3.640625, "loss/jsd": 0.0, "loss/logits": 0.332491010427475, "step": 1046 }, { "epoch": 0.0655, "grad_norm": 4.0, "grad_norm_var": 0.1025390625, "learning_rate": 0.0001, "loss": 9.356, "loss/crossentropy": 2.3319748640060425, "loss/hidden": 3.625, "loss/jsd": 0.0, "loss/logits": 0.3209776282310486, "step": 1048 }, { "epoch": 0.065625, "grad_norm": 4.0625, "grad_norm_var": 0.0375396728515625, "learning_rate": 0.0001, "loss": 9.3369, "loss/crossentropy": 2.2262803316116333, "loss/hidden": 3.65625, "loss/jsd": 0.0, "loss/logits": 0.3301280736923218, "step": 1050 }, { "epoch": 0.06575, "grad_norm": 3.765625, "grad_norm_var": 0.041239420572916664, "learning_rate": 0.0001, "loss": 9.0379, "loss/crossentropy": 2.3847025632858276, "loss/hidden": 3.6328125, "loss/jsd": 0.0, "loss/logits": 0.30600911378860474, "step": 1052 }, { "epoch": 0.065875, "grad_norm": 3.953125, "grad_norm_var": 0.03713785807291667, "learning_rate": 0.0001, "loss": 9.1241, "loss/crossentropy": 2.2389484643936157, "loss/hidden": 3.6015625, "loss/jsd": 0.0, "loss/logits": 0.31373530626296997, "step": 1054 }, { "epoch": 0.066, "grad_norm": 3.953125, "grad_norm_var": 0.04392801920572917, "learning_rate": 0.0001, "loss": 9.324, "loss/crossentropy": 2.4310184717178345, "loss/hidden": 3.7109375, "loss/jsd": 0.0, "loss/logits": 0.3465813100337982, "step": 1056 }, { "epoch": 0.066125, "grad_norm": 3.546875, "grad_norm_var": 0.055475870768229164, "learning_rate": 0.0001, "loss": 8.9162, "loss/crossentropy": 2.394460439682007, "loss/hidden": 3.6015625, "loss/jsd": 0.0, "loss/logits": 0.3166656345129013, "step": 1058 }, { "epoch": 0.06625, "grad_norm": 3.703125, "grad_norm_var": 0.052855428059895834, "learning_rate": 0.0001, "loss": 9.4262, "loss/crossentropy": 2.5295032262802124, "loss/hidden": 3.6640625, "loss/jsd": 0.0, "loss/logits": 0.3500918596982956, "step": 1060 }, { "epoch": 0.066375, "grad_norm": 4.15625, "grad_norm_var": 0.051920572916666664, "learning_rate": 0.0001, "loss": 9.3493, "loss/crossentropy": 2.2424673438072205, "loss/hidden": 3.6484375, "loss/jsd": 0.0, "loss/logits": 0.3131362199783325, "step": 1062 }, { "epoch": 0.0665, "grad_norm": 3.65625, "grad_norm_var": 0.04260660807291667, "learning_rate": 0.0001, "loss": 9.1694, "loss/crossentropy": 2.303846836090088, "loss/hidden": 3.75, "loss/jsd": 0.0, "loss/logits": 0.3290853351354599, "step": 1064 }, { "epoch": 0.066625, "grad_norm": 3.734375, "grad_norm_var": 0.030060831705729166, "learning_rate": 0.0001, "loss": 9.1424, "loss/crossentropy": 2.417978286743164, "loss/hidden": 3.640625, "loss/jsd": 0.0, "loss/logits": 0.3248021900653839, "step": 1066 }, { "epoch": 0.06675, "grad_norm": 3.8125, "grad_norm_var": 0.029683430989583332, "learning_rate": 0.0001, "loss": 9.2021, "loss/crossentropy": 2.1610294580459595, "loss/hidden": 3.6015625, "loss/jsd": 0.0, "loss/logits": 0.31622791290283203, "step": 1068 }, { "epoch": 0.066875, "grad_norm": 3.59375, "grad_norm_var": 0.029792277018229167, "learning_rate": 0.0001, "loss": 9.122, "loss/crossentropy": 2.254276156425476, "loss/hidden": 3.59375, "loss/jsd": 0.0, "loss/logits": 0.29598017036914825, "step": 1070 }, { "epoch": 0.067, "grad_norm": 4.375, "grad_norm_var": 0.04895833333333333, "learning_rate": 0.0001, "loss": 9.5952, "loss/crossentropy": 2.485300302505493, "loss/hidden": 3.6328125, "loss/jsd": 0.0, "loss/logits": 0.3379819989204407, "step": 1072 }, { "epoch": 0.067125, "grad_norm": 4.125, "grad_norm_var": 0.0529937744140625, "learning_rate": 0.0001, "loss": 9.3245, "loss/crossentropy": 2.337808847427368, "loss/hidden": 3.6484375, "loss/jsd": 0.0, "loss/logits": 0.3292604982852936, "step": 1074 }, { "epoch": 0.06725, "grad_norm": 3.828125, "grad_norm_var": 0.05022786458333333, "learning_rate": 0.0001, "loss": 9.4473, "loss/crossentropy": 2.394913077354431, "loss/hidden": 3.6875, "loss/jsd": 0.0, "loss/logits": 0.34427976608276367, "step": 1076 }, { "epoch": 0.067375, "grad_norm": 4.03125, "grad_norm_var": 0.05392252604166667, "learning_rate": 0.0001, "loss": 9.4114, "loss/crossentropy": 2.4923205375671387, "loss/hidden": 3.6640625, "loss/jsd": 0.0, "loss/logits": 0.3431336283683777, "step": 1078 }, { "epoch": 0.0675, "grad_norm": 4.0, "grad_norm_var": 0.05250244140625, "learning_rate": 0.0001, "loss": 9.3676, "loss/crossentropy": 2.405529022216797, "loss/hidden": 3.6328125, "loss/jsd": 0.0, "loss/logits": 0.33384498953819275, "step": 1080 }, { "epoch": 0.067625, "grad_norm": 4.1875, "grad_norm_var": 0.06123758951822917, "learning_rate": 0.0001, "loss": 9.0372, "loss/crossentropy": 2.446845769882202, "loss/hidden": 3.609375, "loss/jsd": 0.0, "loss/logits": 0.30565811693668365, "step": 1082 }, { "epoch": 0.06775, "grad_norm": 4.375, "grad_norm_var": 0.06928609212239584, "learning_rate": 0.0001, "loss": 9.4077, "loss/crossentropy": 2.481779932975769, "loss/hidden": 3.5703125, "loss/jsd": 0.0, "loss/logits": 0.3120981603860855, "step": 1084 }, { "epoch": 0.067875, "grad_norm": 3.625, "grad_norm_var": 0.07662353515625, "learning_rate": 0.0001, "loss": 9.3184, "loss/crossentropy": 2.2164746522903442, "loss/hidden": 3.671875, "loss/jsd": 0.0, "loss/logits": 0.29565760493278503, "step": 1086 }, { "epoch": 0.068, "grad_norm": 4.3125, "grad_norm_var": 0.0718170166015625, "learning_rate": 0.0001, "loss": 9.4184, "loss/crossentropy": 2.53956139087677, "loss/hidden": 3.65625, "loss/jsd": 0.0, "loss/logits": 0.34126120805740356, "step": 1088 }, { "epoch": 0.068125, "grad_norm": 3.75, "grad_norm_var": 0.06967671712239583, "learning_rate": 0.0001, "loss": 9.3542, "loss/crossentropy": 2.4937098026275635, "loss/hidden": 3.625, "loss/jsd": 0.0, "loss/logits": 0.32088300585746765, "step": 1090 }, { "epoch": 0.06825, "grad_norm": 3.828125, "grad_norm_var": 0.06809895833333333, "learning_rate": 0.0001, "loss": 9.1749, "loss/crossentropy": 2.387190341949463, "loss/hidden": 3.5859375, "loss/jsd": 0.0, "loss/logits": 0.33582253754138947, "step": 1092 }, { "epoch": 0.068375, "grad_norm": 3.890625, "grad_norm_var": 0.061747233072916664, "learning_rate": 0.0001, "loss": 9.4676, "loss/crossentropy": 2.626872181892395, "loss/hidden": 3.7421875, "loss/jsd": 0.0, "loss/logits": 0.38073016703128815, "step": 1094 }, { "epoch": 0.0685, "grad_norm": 4.21875, "grad_norm_var": 0.06539306640625, "learning_rate": 0.0001, "loss": 9.3993, "loss/crossentropy": 2.449720859527588, "loss/hidden": 3.609375, "loss/jsd": 0.0, "loss/logits": 0.338313028216362, "step": 1096 }, { "epoch": 0.068625, "grad_norm": 3.84375, "grad_norm_var": 0.12389322916666666, "learning_rate": 0.0001, "loss": 9.3835, "loss/crossentropy": 2.2490886449813843, "loss/hidden": 3.7890625, "loss/jsd": 0.0, "loss/logits": 0.3486269563436508, "step": 1098 }, { "epoch": 0.06875, "grad_norm": 3.8125, "grad_norm_var": 0.11367899576822917, "learning_rate": 0.0001, "loss": 9.0693, "loss/crossentropy": 2.353515625, "loss/hidden": 3.6875, "loss/jsd": 0.0, "loss/logits": 0.3053932934999466, "step": 1100 }, { "epoch": 0.068875, "grad_norm": 3.953125, "grad_norm_var": 0.09433186848958333, "learning_rate": 0.0001, "loss": 9.2217, "loss/crossentropy": 2.1353343725204468, "loss/hidden": 3.625, "loss/jsd": 0.0, "loss/logits": 0.31745678186416626, "step": 1102 }, { "epoch": 0.069, "grad_norm": 6.28125, "grad_norm_var": 0.4137980143229167, "learning_rate": 0.0001, "loss": 9.3661, "loss/crossentropy": 1.9615400433540344, "loss/hidden": 3.796875, "loss/jsd": 0.0, "loss/logits": 0.3933947682380676, "step": 1104 }, { "epoch": 0.069125, "grad_norm": 4.125, "grad_norm_var": 0.40084228515625, "learning_rate": 0.0001, "loss": 9.4, "loss/crossentropy": 2.4881348609924316, "loss/hidden": 3.671875, "loss/jsd": 0.0, "loss/logits": 0.35493068397045135, "step": 1106 }, { "epoch": 0.06925, "grad_norm": 6.09375, "grad_norm_var": 0.6248372395833334, "learning_rate": 0.0001, "loss": 9.611, "loss/crossentropy": 2.47869610786438, "loss/hidden": 3.6875, "loss/jsd": 0.0, "loss/logits": 0.3342919647693634, "step": 1108 }, { "epoch": 0.069375, "grad_norm": 4.03125, "grad_norm_var": 0.628662109375, "learning_rate": 0.0001, "loss": 9.2084, "loss/crossentropy": 2.07807195186615, "loss/hidden": 3.578125, "loss/jsd": 0.0, "loss/logits": 0.31260576844215393, "step": 1110 }, { "epoch": 0.0695, "grad_norm": 3.953125, "grad_norm_var": 0.6350260416666667, "learning_rate": 0.0001, "loss": 9.0765, "loss/crossentropy": 2.4429653882980347, "loss/hidden": 3.671875, "loss/jsd": 0.0, "loss/logits": 0.34172505140304565, "step": 1112 }, { "epoch": 0.069625, "grad_norm": 3.75, "grad_norm_var": 0.6293253580729167, "learning_rate": 0.0001, "loss": 9.4565, "loss/crossentropy": 2.2236061096191406, "loss/hidden": 3.6328125, "loss/jsd": 0.0, "loss/logits": 0.3183263838291168, "step": 1114 }, { "epoch": 0.06975, "grad_norm": 4.03125, "grad_norm_var": 0.611962890625, "learning_rate": 0.0001, "loss": 9.4014, "loss/crossentropy": 2.490137457847595, "loss/hidden": 3.671875, "loss/jsd": 0.0, "loss/logits": 0.334211602807045, "step": 1116 }, { "epoch": 0.069875, "grad_norm": 3.671875, "grad_norm_var": 0.62593994140625, "learning_rate": 0.0001, "loss": 9.2132, "loss/crossentropy": 2.503127098083496, "loss/hidden": 3.640625, "loss/jsd": 0.0, "loss/logits": 0.3488759547472, "step": 1118 }, { "epoch": 0.07, "grad_norm": 4.0, "grad_norm_var": 0.38203125, "learning_rate": 0.0001, "loss": 9.0646, "loss/crossentropy": 2.588125228881836, "loss/hidden": 3.6484375, "loss/jsd": 0.0, "loss/logits": 0.32440805435180664, "step": 1120 }, { "epoch": 0.070125, "grad_norm": 5.1875, "grad_norm_var": 0.44537760416666666, "learning_rate": 0.0001, "loss": 9.6197, "loss/crossentropy": 2.4455296993255615, "loss/hidden": 3.71875, "loss/jsd": 0.0, "loss/logits": 0.33201858401298523, "step": 1122 }, { "epoch": 0.07025, "grad_norm": 3.953125, "grad_norm_var": 0.1257232666015625, "learning_rate": 0.0001, "loss": 9.3346, "loss/crossentropy": 2.5608623027801514, "loss/hidden": 3.6875, "loss/jsd": 0.0, "loss/logits": 0.3407406657934189, "step": 1124 }, { "epoch": 0.070375, "grad_norm": 3.9375, "grad_norm_var": 0.12796122233072918, "learning_rate": 0.0001, "loss": 9.4367, "loss/crossentropy": 2.4927167892456055, "loss/hidden": 3.6640625, "loss/jsd": 0.0, "loss/logits": 0.3708173930644989, "step": 1126 }, { "epoch": 0.0705, "grad_norm": 3.796875, "grad_norm_var": 0.14039306640625, "learning_rate": 0.0001, "loss": 9.3054, "loss/crossentropy": 2.3557698726654053, "loss/hidden": 3.6328125, "loss/jsd": 0.0, "loss/logits": 0.30249859392642975, "step": 1128 }, { "epoch": 0.070625, "grad_norm": 4.0, "grad_norm_var": 0.13619791666666667, "learning_rate": 0.0001, "loss": 9.3421, "loss/crossentropy": 2.2762213945388794, "loss/hidden": 3.5703125, "loss/jsd": 0.0, "loss/logits": 0.31596677005290985, "step": 1130 }, { "epoch": 0.07075, "grad_norm": 3.484375, "grad_norm_var": 0.15147196451822917, "learning_rate": 0.0001, "loss": 9.1813, "loss/crossentropy": 2.3421590328216553, "loss/hidden": 3.6015625, "loss/jsd": 0.0, "loss/logits": 0.3331995755434036, "step": 1132 }, { "epoch": 0.070875, "grad_norm": 4.34375, "grad_norm_var": 0.87095947265625, "learning_rate": 0.0001, "loss": 9.3163, "loss/crossentropy": 2.263655185699463, "loss/hidden": 3.59375, "loss/jsd": 0.0, "loss/logits": 0.3255555182695389, "step": 1134 }, { "epoch": 0.071, "grad_norm": 3.9375, "grad_norm_var": 0.8688761393229166, "learning_rate": 0.0001, "loss": 9.2828, "loss/crossentropy": 2.5721757411956787, "loss/hidden": 3.6953125, "loss/jsd": 0.0, "loss/logits": 0.346492663025856, "step": 1136 }, { "epoch": 0.071125, "grad_norm": 3.96875, "grad_norm_var": 0.7981608072916667, "learning_rate": 0.0001, "loss": 9.1755, "loss/crossentropy": 2.3569506406784058, "loss/hidden": 3.5703125, "loss/jsd": 0.0, "loss/logits": 0.3098555654287338, "step": 1138 }, { "epoch": 0.07125, "grad_norm": 3.671875, "grad_norm_var": 0.8078684488932292, "learning_rate": 0.0001, "loss": 9.3109, "loss/crossentropy": 2.5194830894470215, "loss/hidden": 3.6796875, "loss/jsd": 0.0, "loss/logits": 0.33717748522758484, "step": 1140 }, { "epoch": 0.071375, "grad_norm": 3.765625, "grad_norm_var": 0.8381011962890625, "learning_rate": 0.0001, "loss": 9.0862, "loss/crossentropy": 2.352966547012329, "loss/hidden": 3.59375, "loss/jsd": 0.0, "loss/logits": 0.31706100702285767, "step": 1142 }, { "epoch": 0.0715, "grad_norm": 4.03125, "grad_norm_var": 0.8229777018229166, "learning_rate": 0.0001, "loss": 9.0614, "loss/crossentropy": 2.36809766292572, "loss/hidden": 3.59375, "loss/jsd": 0.0, "loss/logits": 0.30267640948295593, "step": 1144 }, { "epoch": 0.071625, "grad_norm": 3.765625, "grad_norm_var": 0.8300120035807291, "learning_rate": 0.0001, "loss": 9.2628, "loss/crossentropy": 2.2535144090652466, "loss/hidden": 3.59375, "loss/jsd": 0.0, "loss/logits": 0.3255711644887924, "step": 1146 }, { "epoch": 0.07175, "grad_norm": 4.25, "grad_norm_var": 0.8011027018229167, "learning_rate": 0.0001, "loss": 9.4743, "loss/crossentropy": 2.443315029144287, "loss/hidden": 3.625, "loss/jsd": 0.0, "loss/logits": 0.3342190533876419, "step": 1148 }, { "epoch": 0.071875, "grad_norm": 3.578125, "grad_norm_var": 0.049860636393229164, "learning_rate": 0.0001, "loss": 9.0595, "loss/crossentropy": 2.3228014707565308, "loss/hidden": 3.5859375, "loss/jsd": 0.0, "loss/logits": 0.3196340799331665, "step": 1150 }, { "epoch": 0.072, "grad_norm": 4.1875, "grad_norm_var": 0.05780843098958333, "learning_rate": 0.0001, "loss": 9.1367, "loss/crossentropy": 2.449334144592285, "loss/hidden": 3.640625, "loss/jsd": 0.0, "loss/logits": 0.3382084518671036, "step": 1152 }, { "epoch": 0.072125, "grad_norm": 4.40625, "grad_norm_var": 0.07054036458333333, "learning_rate": 0.0001, "loss": 9.4249, "loss/crossentropy": 2.2319095134735107, "loss/hidden": 3.6015625, "loss/jsd": 0.0, "loss/logits": 0.3391086161136627, "step": 1154 }, { "epoch": 0.07225, "grad_norm": 4.4375, "grad_norm_var": 0.10032145182291667, "learning_rate": 0.0001, "loss": 9.2504, "loss/crossentropy": 2.5388150215148926, "loss/hidden": 3.6171875, "loss/jsd": 0.0, "loss/logits": 0.3299136161804199, "step": 1156 }, { "epoch": 0.072375, "grad_norm": 3.453125, "grad_norm_var": 0.10137430826822917, "learning_rate": 0.0001, "loss": 8.8611, "loss/crossentropy": 2.317844271659851, "loss/hidden": 3.6171875, "loss/jsd": 0.0, "loss/logits": 0.28633375465869904, "step": 1158 }, { "epoch": 0.0725, "grad_norm": 3.828125, "grad_norm_var": 0.10498046875, "learning_rate": 0.0001, "loss": 9.2766, "loss/crossentropy": 2.5527602434158325, "loss/hidden": 3.640625, "loss/jsd": 0.0, "loss/logits": 0.34664320945739746, "step": 1160 }, { "epoch": 0.072625, "grad_norm": 3.546875, "grad_norm_var": 0.11223856608072917, "learning_rate": 0.0001, "loss": 8.9242, "loss/crossentropy": 2.2007336616516113, "loss/hidden": 3.578125, "loss/jsd": 0.0, "loss/logits": 0.3022945523262024, "step": 1162 }, { "epoch": 0.07275, "grad_norm": 3.953125, "grad_norm_var": 0.1042388916015625, "learning_rate": 0.0001, "loss": 9.1566, "loss/crossentropy": 2.435725212097168, "loss/hidden": 3.5, "loss/jsd": 0.0, "loss/logits": 0.2984570860862732, "step": 1164 }, { "epoch": 0.072875, "grad_norm": 3.765625, "grad_norm_var": 0.09881083170572917, "learning_rate": 0.0001, "loss": 9.3371, "loss/crossentropy": 2.3465352058410645, "loss/hidden": 3.703125, "loss/jsd": 0.0, "loss/logits": 0.33409954607486725, "step": 1166 }, { "epoch": 0.073, "grad_norm": 3.65625, "grad_norm_var": 0.09781494140625, "learning_rate": 0.0001, "loss": 9.2003, "loss/crossentropy": 2.35371732711792, "loss/hidden": 3.640625, "loss/jsd": 0.0, "loss/logits": 0.3324984610080719, "step": 1168 }, { "epoch": 0.073125, "grad_norm": 3.9375, "grad_norm_var": 0.07768452962239583, "learning_rate": 0.0001, "loss": 9.2635, "loss/crossentropy": 2.619705319404602, "loss/hidden": 3.625, "loss/jsd": 0.0, "loss/logits": 0.33950985968112946, "step": 1170 }, { "epoch": 0.07325, "grad_norm": 3.515625, "grad_norm_var": 0.021922810872395834, "learning_rate": 0.0001, "loss": 9.1711, "loss/crossentropy": 2.089954376220703, "loss/hidden": 3.46875, "loss/jsd": 0.0, "loss/logits": 0.3136076331138611, "step": 1172 }, { "epoch": 0.073375, "grad_norm": 4.25, "grad_norm_var": 0.037474568684895834, "learning_rate": 0.0001, "loss": 9.357, "loss/crossentropy": 2.347720980644226, "loss/hidden": 3.5703125, "loss/jsd": 0.0, "loss/logits": 0.3133264631032944, "step": 1174 }, { "epoch": 0.0735, "grad_norm": 3.734375, "grad_norm_var": 0.0374176025390625, "learning_rate": 0.0001, "loss": 9.0205, "loss/crossentropy": 2.1896666288375854, "loss/hidden": 3.609375, "loss/jsd": 0.0, "loss/logits": 0.31842848658561707, "step": 1176 }, { "epoch": 0.073625, "grad_norm": 3.640625, "grad_norm_var": 0.035139973958333334, "learning_rate": 0.0001, "loss": 9.314, "loss/crossentropy": 2.5638844966888428, "loss/hidden": 3.625, "loss/jsd": 0.0, "loss/logits": 0.3261349946260452, "step": 1178 }, { "epoch": 0.07375, "grad_norm": 3.703125, "grad_norm_var": 0.03573811848958333, "learning_rate": 0.0001, "loss": 9.0808, "loss/crossentropy": 2.2707711458206177, "loss/hidden": 3.59375, "loss/jsd": 0.0, "loss/logits": 0.3127121925354004, "step": 1180 }, { "epoch": 0.073875, "grad_norm": 3.734375, "grad_norm_var": 0.03573811848958333, "learning_rate": 0.0001, "loss": 9.2911, "loss/crossentropy": 2.4895883798599243, "loss/hidden": 3.5859375, "loss/jsd": 0.0, "loss/logits": 0.3251212239265442, "step": 1182 }, { "epoch": 0.074, "grad_norm": 4.4375, "grad_norm_var": 0.06560872395833334, "learning_rate": 0.0001, "loss": 9.4185, "loss/crossentropy": 2.573387026786804, "loss/hidden": 3.8359375, "loss/jsd": 0.0, "loss/logits": 0.4014684557914734, "step": 1184 }, { "epoch": 0.074125, "grad_norm": 3.984375, "grad_norm_var": 0.06645406087239583, "learning_rate": 0.0001, "loss": 9.3622, "loss/crossentropy": 2.5062469244003296, "loss/hidden": 3.6484375, "loss/jsd": 0.0, "loss/logits": 0.3334721326828003, "step": 1186 }, { "epoch": 0.07425, "grad_norm": 3.578125, "grad_norm_var": 0.0643707275390625, "learning_rate": 0.0001, "loss": 8.7469, "loss/crossentropy": 2.2451168298721313, "loss/hidden": 3.53125, "loss/jsd": 0.0, "loss/logits": 0.2801739275455475, "step": 1188 }, { "epoch": 0.074375, "grad_norm": 3.84375, "grad_norm_var": 0.05558980305989583, "learning_rate": 0.0001, "loss": 9.3391, "loss/crossentropy": 2.5548393726348877, "loss/hidden": 3.6328125, "loss/jsd": 0.0, "loss/logits": 0.30821627378463745, "step": 1190 }, { "epoch": 0.0745, "grad_norm": 3.71875, "grad_norm_var": 0.05677083333333333, "learning_rate": 0.0001, "loss": 9.166, "loss/crossentropy": 2.4481059312820435, "loss/hidden": 3.7265625, "loss/jsd": 0.0, "loss/logits": 0.37263134121894836, "step": 1192 }, { "epoch": 0.074625, "grad_norm": 3.609375, "grad_norm_var": 0.05585835774739583, "learning_rate": 0.0001, "loss": 9.3454, "loss/crossentropy": 2.0716623067855835, "loss/hidden": 3.4375, "loss/jsd": 0.0, "loss/logits": 0.305528461933136, "step": 1194 }, { "epoch": 0.07475, "grad_norm": 3.75, "grad_norm_var": 0.05478108723958333, "learning_rate": 0.0001, "loss": 9.1304, "loss/crossentropy": 2.4684784412384033, "loss/hidden": 3.6015625, "loss/jsd": 0.0, "loss/logits": 0.33831028640270233, "step": 1196 }, { "epoch": 0.074875, "grad_norm": 3.953125, "grad_norm_var": 0.05436909993489583, "learning_rate": 0.0001, "loss": 9.0607, "loss/crossentropy": 2.307690143585205, "loss/hidden": 3.5546875, "loss/jsd": 0.0, "loss/logits": 0.3136231005191803, "step": 1198 }, { "epoch": 0.075, "grad_norm": 3.765625, "grad_norm_var": 0.03463134765625, "learning_rate": 0.0001, "loss": 9.526, "loss/crossentropy": 2.3969805240631104, "loss/hidden": 3.640625, "loss/jsd": 0.0, "loss/logits": 0.33685119450092316, "step": 1200 }, { "epoch": 0.075125, "grad_norm": 4.125, "grad_norm_var": 0.03902587890625, "learning_rate": 0.0001, "loss": 9.3325, "loss/crossentropy": 2.4884891510009766, "loss/hidden": 3.6484375, "loss/jsd": 0.0, "loss/logits": 0.32767656445503235, "step": 1202 }, { "epoch": 0.07525, "grad_norm": 4.125, "grad_norm_var": 0.043440755208333334, "learning_rate": 0.0001, "loss": 9.1195, "loss/crossentropy": 2.2092502117156982, "loss/hidden": 3.5625, "loss/jsd": 0.0, "loss/logits": 0.3058091998100281, "step": 1204 }, { "epoch": 0.075375, "grad_norm": 3.640625, "grad_norm_var": 0.04667867024739583, "learning_rate": 0.0001, "loss": 9.4251, "loss/crossentropy": 2.6787761449813843, "loss/hidden": 3.6640625, "loss/jsd": 0.0, "loss/logits": 0.3413826525211334, "step": 1206 }, { "epoch": 0.0755, "grad_norm": 3.90625, "grad_norm_var": 0.05217183430989583, "learning_rate": 0.0001, "loss": 8.9467, "loss/crossentropy": 2.5071334838867188, "loss/hidden": 3.6015625, "loss/jsd": 0.0, "loss/logits": 0.3084910959005356, "step": 1208 }, { "epoch": 0.075625, "grad_norm": 3.421875, "grad_norm_var": 0.06331380208333333, "learning_rate": 0.0001, "loss": 9.3112, "loss/crossentropy": 2.4085506200790405, "loss/hidden": 3.5546875, "loss/jsd": 0.0, "loss/logits": 0.31537193059921265, "step": 1210 }, { "epoch": 0.07575, "grad_norm": 4.125, "grad_norm_var": 0.06617838541666667, "learning_rate": 0.0001, "loss": 9.025, "loss/crossentropy": 2.370519518852234, "loss/hidden": 3.6015625, "loss/jsd": 0.0, "loss/logits": 0.3043440580368042, "step": 1212 }, { "epoch": 0.075875, "grad_norm": 3.484375, "grad_norm_var": 0.081591796875, "learning_rate": 0.0001, "loss": 8.7902, "loss/crossentropy": 2.093766450881958, "loss/hidden": 3.5078125, "loss/jsd": 0.0, "loss/logits": 0.2689971700310707, "step": 1214 }, { "epoch": 0.076, "grad_norm": 3.75, "grad_norm_var": 0.068896484375, "learning_rate": 0.0001, "loss": 9.2334, "loss/crossentropy": 2.459157109260559, "loss/hidden": 3.703125, "loss/jsd": 0.0, "loss/logits": 0.3207740783691406, "step": 1216 }, { "epoch": 0.076125, "grad_norm": 3.390625, "grad_norm_var": 0.06848551432291666, "learning_rate": 0.0001, "loss": 9.2117, "loss/crossentropy": 2.3601810932159424, "loss/hidden": 3.53125, "loss/jsd": 0.0, "loss/logits": 0.3157646358013153, "step": 1218 }, { "epoch": 0.07625, "grad_norm": 3.5625, "grad_norm_var": 0.03772379557291667, "learning_rate": 0.0001, "loss": 9.262, "loss/crossentropy": 2.4909332990646362, "loss/hidden": 3.578125, "loss/jsd": 0.0, "loss/logits": 0.30380991101264954, "step": 1220 }, { "epoch": 0.076375, "grad_norm": 3.765625, "grad_norm_var": 0.0382720947265625, "learning_rate": 0.0001, "loss": 9.1849, "loss/crossentropy": 2.6187103986740112, "loss/hidden": 3.625, "loss/jsd": 0.0, "loss/logits": 0.3528241813182831, "step": 1222 }, { "epoch": 0.0765, "grad_norm": 3.53125, "grad_norm_var": 0.0363677978515625, "learning_rate": 0.0001, "loss": 8.9057, "loss/crossentropy": 2.1186509132385254, "loss/hidden": 3.484375, "loss/jsd": 0.0, "loss/logits": 0.32527799904346466, "step": 1224 }, { "epoch": 0.076625, "grad_norm": 3.671875, "grad_norm_var": 0.032160441080729164, "learning_rate": 0.0001, "loss": 9.3395, "loss/crossentropy": 2.339973211288452, "loss/hidden": 3.5546875, "loss/jsd": 0.0, "loss/logits": 0.31841570138931274, "step": 1226 }, { "epoch": 0.07675, "grad_norm": 4.625, "grad_norm_var": 0.0916015625, "learning_rate": 0.0001, "loss": 9.3554, "loss/crossentropy": 2.5320791006088257, "loss/hidden": 3.5859375, "loss/jsd": 0.0, "loss/logits": 0.316191166639328, "step": 1228 }, { "epoch": 0.076875, "grad_norm": 3.671875, "grad_norm_var": 0.0856353759765625, "learning_rate": 0.0001, "loss": 8.8309, "loss/crossentropy": 2.2061994075775146, "loss/hidden": 3.40625, "loss/jsd": 0.0, "loss/logits": 0.3057589828968048, "step": 1230 }, { "epoch": 0.077, "grad_norm": 3.40625, "grad_norm_var": 0.10224202473958334, "learning_rate": 0.0001, "loss": 8.9979, "loss/crossentropy": 2.3670225143432617, "loss/hidden": 3.46875, "loss/jsd": 0.0, "loss/logits": 0.30261367559432983, "step": 1232 }, { "epoch": 0.077125, "grad_norm": 3.46875, "grad_norm_var": 0.09973958333333334, "learning_rate": 0.0001, "loss": 9.1629, "loss/crossentropy": 2.434694290161133, "loss/hidden": 3.609375, "loss/jsd": 0.0, "loss/logits": 0.3391081690788269, "step": 1234 }, { "epoch": 0.07725, "grad_norm": 3.859375, "grad_norm_var": 0.10047098795572916, "learning_rate": 0.0001, "loss": 9.0251, "loss/crossentropy": 2.3634976148605347, "loss/hidden": 3.546875, "loss/jsd": 0.0, "loss/logits": 0.3024301528930664, "step": 1236 }, { "epoch": 0.077375, "grad_norm": 3.734375, "grad_norm_var": 0.17688802083333333, "learning_rate": 0.0001, "loss": 9.3899, "loss/crossentropy": 2.307217240333557, "loss/hidden": 3.6875, "loss/jsd": 0.0, "loss/logits": 0.3346613794565201, "step": 1238 }, { "epoch": 0.0775, "grad_norm": 3.75, "grad_norm_var": 0.16782124837239584, "learning_rate": 0.0001, "loss": 9.0753, "loss/crossentropy": 2.18193256855011, "loss/hidden": 3.6015625, "loss/jsd": 0.0, "loss/logits": 0.3203928619623184, "step": 1240 }, { "epoch": 0.077625, "grad_norm": 3.53125, "grad_norm_var": 0.17125244140625, "learning_rate": 0.0001, "loss": 8.8909, "loss/crossentropy": 2.4916880130767822, "loss/hidden": 3.53125, "loss/jsd": 0.0, "loss/logits": 0.302780881524086, "step": 1242 }, { "epoch": 0.07775, "grad_norm": 12.9375, "grad_norm_var": 6.389351399739583, "learning_rate": 0.0001, "loss": 9.83, "loss/crossentropy": 2.3125792741775513, "loss/hidden": 3.4765625, "loss/jsd": 0.0, "loss/logits": 0.33007092773914337, "step": 1244 }, { "epoch": 0.077875, "grad_norm": 4.625, "grad_norm_var": 46.981346638997394, "learning_rate": 0.0001, "loss": 9.6705, "loss/crossentropy": 2.3381155729293823, "loss/hidden": 3.6796875, "loss/jsd": 0.0, "loss/logits": 0.4021202623844147, "step": 1246 }, { "epoch": 0.078, "grad_norm": 3.84375, "grad_norm_var": 46.28162333170573, "learning_rate": 0.0001, "loss": 9.3695, "loss/crossentropy": 2.440253973007202, "loss/hidden": 3.6796875, "loss/jsd": 0.0, "loss/logits": 0.3352806717157364, "step": 1248 }, { "epoch": 0.078125, "grad_norm": 3.625, "grad_norm_var": 46.24781494140625, "learning_rate": 0.0001, "loss": 9.1471, "loss/crossentropy": 2.3579354286193848, "loss/hidden": 3.5625, "loss/jsd": 0.0, "loss/logits": 0.30118629336357117, "step": 1250 }, { "epoch": 0.07825, "grad_norm": 3.578125, "grad_norm_var": 46.2870595296224, "learning_rate": 0.0001, "loss": 9.1803, "loss/crossentropy": 2.203198552131653, "loss/hidden": 3.5390625, "loss/jsd": 0.0, "loss/logits": 0.32657940685749054, "step": 1252 }, { "epoch": 0.078375, "grad_norm": 3.765625, "grad_norm_var": 46.582112630208336, "learning_rate": 0.0001, "loss": 9.1639, "loss/crossentropy": 2.3819659948349, "loss/hidden": 3.6171875, "loss/jsd": 0.0, "loss/logits": 0.3207147717475891, "step": 1254 }, { "epoch": 0.0785, "grad_norm": 3.71875, "grad_norm_var": 46.60387369791667, "learning_rate": 0.0001, "loss": 9.3193, "loss/crossentropy": 2.3398979902267456, "loss/hidden": 3.59375, "loss/jsd": 0.0, "loss/logits": 0.31140096485614777, "step": 1256 }, { "epoch": 0.078625, "grad_norm": 3.75, "grad_norm_var": 46.48379618326823, "learning_rate": 0.0001, "loss": 9.2251, "loss/crossentropy": 2.349861264228821, "loss/hidden": 3.6640625, "loss/jsd": 0.0, "loss/logits": 0.30966590344905853, "step": 1258 }, { "epoch": 0.07875, "grad_norm": 3.59375, "grad_norm_var": 43.54053446451823, "learning_rate": 0.0001, "loss": 9.0854, "loss/crossentropy": 2.361837148666382, "loss/hidden": 3.578125, "loss/jsd": 0.0, "loss/logits": 0.3148733079433441, "step": 1260 }, { "epoch": 0.078875, "grad_norm": 3.6875, "grad_norm_var": 0.15891825358072917, "learning_rate": 0.0001, "loss": 9.0902, "loss/crossentropy": 2.3919564485549927, "loss/hidden": 3.4765625, "loss/jsd": 0.0, "loss/logits": 0.3144863545894623, "step": 1262 }, { "epoch": 0.079, "grad_norm": 4.09375, "grad_norm_var": 0.0195709228515625, "learning_rate": 0.0001, "loss": 9.1359, "loss/crossentropy": 2.456905484199524, "loss/hidden": 3.5234375, "loss/jsd": 0.0, "loss/logits": 0.34087100625038147, "step": 1264 }, { "epoch": 0.079125, "grad_norm": 4.09375, "grad_norm_var": 0.03790690104166667, "learning_rate": 0.0001, "loss": 9.2885, "loss/crossentropy": 2.256826400756836, "loss/hidden": 3.5, "loss/jsd": 0.0, "loss/logits": 0.30895647406578064, "step": 1266 }, { "epoch": 0.07925, "grad_norm": 3.765625, "grad_norm_var": 0.21834208170572916, "learning_rate": 0.0001, "loss": 9.2081, "loss/crossentropy": 2.0980992913246155, "loss/hidden": 3.5390625, "loss/jsd": 0.0, "loss/logits": 0.33980080485343933, "step": 1268 }, { "epoch": 0.079375, "grad_norm": 3.828125, "grad_norm_var": 0.22421875, "learning_rate": 0.0001, "loss": 8.9036, "loss/crossentropy": 2.381914973258972, "loss/hidden": 3.578125, "loss/jsd": 0.0, "loss/logits": 0.3259545713663101, "step": 1270 }, { "epoch": 0.0795, "grad_norm": 4.0625, "grad_norm_var": 0.22944234212239584, "learning_rate": 0.0001, "loss": 9.1066, "loss/crossentropy": 2.3197826147079468, "loss/hidden": 3.4609375, "loss/jsd": 0.0, "loss/logits": 0.3119906038045883, "step": 1272 }, { "epoch": 0.079625, "grad_norm": 4.25, "grad_norm_var": 0.23908589680989584, "learning_rate": 0.0001, "loss": 9.189, "loss/crossentropy": 2.385148286819458, "loss/hidden": 3.6171875, "loss/jsd": 0.0, "loss/logits": 0.324929416179657, "step": 1274 }, { "epoch": 0.07975, "grad_norm": 3.84375, "grad_norm_var": 0.2331207275390625, "learning_rate": 0.0001, "loss": 9.428, "loss/crossentropy": 2.334364414215088, "loss/hidden": 3.5, "loss/jsd": 0.0, "loss/logits": 0.32778996229171753, "step": 1276 }, { "epoch": 0.079875, "grad_norm": 3.421875, "grad_norm_var": 0.24498291015625, "learning_rate": 0.0001, "loss": 8.8294, "loss/crossentropy": 2.047423243522644, "loss/hidden": 3.5625, "loss/jsd": 0.0, "loss/logits": 0.2832287549972534, "step": 1278 }, { "epoch": 0.08, "grad_norm": 3.78125, "grad_norm_var": 0.23700764973958333, "learning_rate": 0.0001, "loss": 8.8972, "loss/crossentropy": 2.3135321140289307, "loss/hidden": 3.546875, "loss/jsd": 0.0, "loss/logits": 0.2986491918563843, "step": 1280 }, { "epoch": 0.080125, "grad_norm": 3.609375, "grad_norm_var": 0.24149983723958332, "learning_rate": 0.0001, "loss": 9.1365, "loss/crossentropy": 2.4352437257766724, "loss/hidden": 3.578125, "loss/jsd": 0.0, "loss/logits": 0.3260779529809952, "step": 1282 }, { "epoch": 0.08025, "grad_norm": 3.4375, "grad_norm_var": 0.06049702962239583, "learning_rate": 0.0001, "loss": 8.73, "loss/crossentropy": 2.3086354732513428, "loss/hidden": 3.5703125, "loss/jsd": 0.0, "loss/logits": 0.3060029596090317, "step": 1284 }, { "epoch": 0.080375, "grad_norm": 3.796875, "grad_norm_var": 0.05864156087239583, "learning_rate": 0.0001, "loss": 8.9689, "loss/crossentropy": 2.385580539703369, "loss/hidden": 3.5859375, "loss/jsd": 0.0, "loss/logits": 0.31818532943725586, "step": 1286 }, { "epoch": 0.0805, "grad_norm": 3.46875, "grad_norm_var": 0.056982421875, "learning_rate": 0.0001, "loss": 9.0075, "loss/crossentropy": 2.6208995580673218, "loss/hidden": 3.6171875, "loss/jsd": 0.0, "loss/logits": 0.3344912976026535, "step": 1288 }, { "epoch": 0.080625, "grad_norm": 3.578125, "grad_norm_var": 0.039286295572916664, "learning_rate": 0.0001, "loss": 8.9292, "loss/crossentropy": 2.2690337896347046, "loss/hidden": 3.5546875, "loss/jsd": 0.0, "loss/logits": 0.3385104089975357, "step": 1290 }, { "epoch": 0.08075, "grad_norm": 3.5, "grad_norm_var": 0.02431640625, "learning_rate": 0.0001, "loss": 8.6751, "loss/crossentropy": 2.236150622367859, "loss/hidden": 3.578125, "loss/jsd": 0.0, "loss/logits": 0.3237984627485275, "step": 1292 }, { "epoch": 0.080875, "grad_norm": 4.09375, "grad_norm_var": 0.03980712890625, "learning_rate": 0.0001, "loss": 9.0435, "loss/crossentropy": 2.26598197221756, "loss/hidden": 3.6171875, "loss/jsd": 0.0, "loss/logits": 0.31261470913887024, "step": 1294 }, { "epoch": 0.081, "grad_norm": 3.84375, "grad_norm_var": 0.039305623372395834, "learning_rate": 0.0001, "loss": 9.2676, "loss/crossentropy": 2.382360100746155, "loss/hidden": 3.5546875, "loss/jsd": 0.0, "loss/logits": 0.31899434328079224, "step": 1296 }, { "epoch": 0.081125, "grad_norm": 3.78125, "grad_norm_var": 0.041991170247395834, "learning_rate": 0.0001, "loss": 9.036, "loss/crossentropy": 2.253870368003845, "loss/hidden": 3.59375, "loss/jsd": 0.0, "loss/logits": 0.3281244486570358, "step": 1298 }, { "epoch": 0.08125, "grad_norm": 3.640625, "grad_norm_var": 0.03935546875, "learning_rate": 0.0001, "loss": 8.9191, "loss/crossentropy": 2.225800395011902, "loss/hidden": 3.5546875, "loss/jsd": 0.0, "loss/logits": 0.32163363695144653, "step": 1300 }, { "epoch": 0.081375, "grad_norm": 3.78125, "grad_norm_var": 0.036844889322916664, "learning_rate": 0.0001, "loss": 9.0865, "loss/crossentropy": 2.4340925216674805, "loss/hidden": 3.5859375, "loss/jsd": 0.0, "loss/logits": 0.3115888386964798, "step": 1302 }, { "epoch": 0.0815, "grad_norm": 3.8125, "grad_norm_var": 0.03950907389322917, "learning_rate": 0.0001, "loss": 9.1363, "loss/crossentropy": 2.3242989778518677, "loss/hidden": 3.515625, "loss/jsd": 0.0, "loss/logits": 0.31321677565574646, "step": 1304 }, { "epoch": 0.081625, "grad_norm": 3.453125, "grad_norm_var": 0.04324544270833333, "learning_rate": 0.0001, "loss": 8.9641, "loss/crossentropy": 2.333641767501831, "loss/hidden": 3.5546875, "loss/jsd": 0.0, "loss/logits": 0.3083188980817795, "step": 1306 }, { "epoch": 0.08175, "grad_norm": 3.8125, "grad_norm_var": 0.041552734375, "learning_rate": 0.0001, "loss": 8.829, "loss/crossentropy": 2.2750980854034424, "loss/hidden": 3.5234375, "loss/jsd": 0.0, "loss/logits": 0.30787941813468933, "step": 1308 }, { "epoch": 0.081875, "grad_norm": 3.59375, "grad_norm_var": 0.0253326416015625, "learning_rate": 0.0001, "loss": 8.8402, "loss/crossentropy": 2.2719295024871826, "loss/hidden": 3.453125, "loss/jsd": 0.0, "loss/logits": 0.2763071060180664, "step": 1310 }, { "epoch": 0.082, "grad_norm": 3.65625, "grad_norm_var": 0.021533203125, "learning_rate": 0.0001, "loss": 9.0406, "loss/crossentropy": 2.6696739196777344, "loss/hidden": 3.6328125, "loss/jsd": 0.0, "loss/logits": 0.32642635703086853, "step": 1312 }, { "epoch": 0.082125, "grad_norm": 3.40625, "grad_norm_var": 0.023346964518229166, "learning_rate": 0.0001, "loss": 8.929, "loss/crossentropy": 2.4896806478500366, "loss/hidden": 3.609375, "loss/jsd": 0.0, "loss/logits": 0.32756727933883667, "step": 1314 }, { "epoch": 0.08225, "grad_norm": 3.78125, "grad_norm_var": 0.025617472330729165, "learning_rate": 0.0001, "loss": 9.2108, "loss/crossentropy": 2.384082555770874, "loss/hidden": 3.4921875, "loss/jsd": 0.0, "loss/logits": 0.31247396767139435, "step": 1316 }, { "epoch": 0.082375, "grad_norm": 3.71875, "grad_norm_var": 0.02974853515625, "learning_rate": 0.0001, "loss": 9.2751, "loss/crossentropy": 2.357794165611267, "loss/hidden": 3.625, "loss/jsd": 0.0, "loss/logits": 0.31041818857192993, "step": 1318 }, { "epoch": 0.0825, "grad_norm": 3.90625, "grad_norm_var": 0.028645833333333332, "learning_rate": 0.0001, "loss": 8.9023, "loss/crossentropy": 2.4072389602661133, "loss/hidden": 3.5625, "loss/jsd": 0.0, "loss/logits": 0.33170604705810547, "step": 1320 }, { "epoch": 0.082625, "grad_norm": 3.53125, "grad_norm_var": 0.034357706705729164, "learning_rate": 0.0001, "loss": 8.9485, "loss/crossentropy": 2.239370107650757, "loss/hidden": 3.5078125, "loss/jsd": 0.0, "loss/logits": 0.29322461783885956, "step": 1322 }, { "epoch": 0.08275, "grad_norm": 3.546875, "grad_norm_var": 0.03426005045572917, "learning_rate": 0.0001, "loss": 9.035, "loss/crossentropy": 2.5135509967803955, "loss/hidden": 3.53125, "loss/jsd": 0.0, "loss/logits": 0.3100414276123047, "step": 1324 }, { "epoch": 0.082875, "grad_norm": 3.953125, "grad_norm_var": 0.04504801432291667, "learning_rate": 0.0001, "loss": 8.9812, "loss/crossentropy": 2.1832433342933655, "loss/hidden": 3.484375, "loss/jsd": 0.0, "loss/logits": 0.3234194219112396, "step": 1326 }, { "epoch": 0.083, "grad_norm": 3.546875, "grad_norm_var": 0.04663798014322917, "learning_rate": 0.0001, "loss": 8.8827, "loss/crossentropy": 2.4349963665008545, "loss/hidden": 3.4453125, "loss/jsd": 0.0, "loss/logits": 0.30108973383903503, "step": 1328 }, { "epoch": 0.083125, "grad_norm": 3.6875, "grad_norm_var": 0.042023722330729166, "learning_rate": 0.0001, "loss": 9.2632, "loss/crossentropy": 2.4420056343078613, "loss/hidden": 3.6015625, "loss/jsd": 0.0, "loss/logits": 0.3511925935745239, "step": 1330 }, { "epoch": 0.08325, "grad_norm": 3.484375, "grad_norm_var": 0.04169820149739583, "learning_rate": 0.0001, "loss": 8.6792, "loss/crossentropy": 2.1621901988983154, "loss/hidden": 3.421875, "loss/jsd": 0.0, "loss/logits": 0.2893373519182205, "step": 1332 }, { "epoch": 0.083375, "grad_norm": 3.59375, "grad_norm_var": 0.036554972330729164, "learning_rate": 0.0001, "loss": 8.8182, "loss/crossentropy": 2.2495174407958984, "loss/hidden": 3.4375, "loss/jsd": 0.0, "loss/logits": 0.29064077138900757, "step": 1334 }, { "epoch": 0.0835, "grad_norm": 3.84375, "grad_norm_var": 0.036896769205729166, "learning_rate": 0.0001, "loss": 8.9092, "loss/crossentropy": 2.2303038835525513, "loss/hidden": 3.5078125, "loss/jsd": 0.0, "loss/logits": 0.3096499443054199, "step": 1336 }, { "epoch": 0.083625, "grad_norm": 4.53125, "grad_norm_var": 0.08069661458333334, "learning_rate": 0.0001, "loss": 9.1106, "loss/crossentropy": 2.56923508644104, "loss/hidden": 3.625, "loss/jsd": 0.0, "loss/logits": 0.31639058887958527, "step": 1338 }, { "epoch": 0.08375, "grad_norm": 3.484375, "grad_norm_var": 0.0837066650390625, "learning_rate": 0.0001, "loss": 8.833, "loss/crossentropy": 2.2457833290100098, "loss/hidden": 3.5, "loss/jsd": 0.0, "loss/logits": 0.2985825538635254, "step": 1340 }, { "epoch": 0.083875, "grad_norm": 3.359375, "grad_norm_var": 0.08302408854166667, "learning_rate": 0.0001, "loss": 8.9599, "loss/crossentropy": 2.405690312385559, "loss/hidden": 3.4453125, "loss/jsd": 0.0, "loss/logits": 0.30246302485466003, "step": 1342 }, { "epoch": 0.084, "grad_norm": 3.65625, "grad_norm_var": 0.08238525390625, "learning_rate": 0.0001, "loss": 8.9099, "loss/crossentropy": 2.360277771949768, "loss/hidden": 3.5703125, "loss/jsd": 0.0, "loss/logits": 0.3205329477787018, "step": 1344 }, { "epoch": 0.084125, "grad_norm": 3.578125, "grad_norm_var": 0.08232421875, "learning_rate": 0.0001, "loss": 9.0251, "loss/crossentropy": 2.4718183279037476, "loss/hidden": 3.515625, "loss/jsd": 0.0, "loss/logits": 0.322445809841156, "step": 1346 }, { "epoch": 0.08425, "grad_norm": 3.78125, "grad_norm_var": 0.08196512858072917, "learning_rate": 0.0001, "loss": 8.9674, "loss/crossentropy": 2.1336525678634644, "loss/hidden": 3.5078125, "loss/jsd": 0.0, "loss/logits": 0.2891063690185547, "step": 1348 }, { "epoch": 0.084375, "grad_norm": 3.671875, "grad_norm_var": 0.08079020182291667, "learning_rate": 0.0001, "loss": 8.7962, "loss/crossentropy": 2.443579316139221, "loss/hidden": 3.5234375, "loss/jsd": 0.0, "loss/logits": 0.32270699739456177, "step": 1350 }, { "epoch": 0.0845, "grad_norm": 3.4375, "grad_norm_var": 0.07742513020833333, "learning_rate": 0.0001, "loss": 8.8363, "loss/crossentropy": 2.47275173664093, "loss/hidden": 3.5234375, "loss/jsd": 0.0, "loss/logits": 0.31648723781108856, "step": 1352 }, { "epoch": 0.084625, "grad_norm": 3.5625, "grad_norm_var": 0.024039713541666667, "learning_rate": 0.0001, "loss": 8.7411, "loss/crossentropy": 2.4415959119796753, "loss/hidden": 3.5078125, "loss/jsd": 0.0, "loss/logits": 0.286969318985939, "step": 1354 }, { "epoch": 0.08475, "grad_norm": 3.71875, "grad_norm_var": 0.018973795572916667, "learning_rate": 0.0001, "loss": 8.8614, "loss/crossentropy": 2.2840429544448853, "loss/hidden": 3.5703125, "loss/jsd": 0.0, "loss/logits": 0.2909524738788605, "step": 1356 }, { "epoch": 0.084875, "grad_norm": 3.609375, "grad_norm_var": 0.016942342122395832, "learning_rate": 0.0001, "loss": 8.7727, "loss/crossentropy": 2.0146145820617676, "loss/hidden": 3.3984375, "loss/jsd": 0.0, "loss/logits": 0.30589647591114044, "step": 1358 }, { "epoch": 0.085, "grad_norm": 3.921875, "grad_norm_var": 0.021906534830729168, "learning_rate": 0.0001, "loss": 9.0598, "loss/crossentropy": 2.3537509441375732, "loss/hidden": 3.546875, "loss/jsd": 0.0, "loss/logits": 0.32555970549583435, "step": 1360 }, { "epoch": 0.085125, "grad_norm": 4.25, "grad_norm_var": 0.048924763997395836, "learning_rate": 0.0001, "loss": 9.1544, "loss/crossentropy": 2.626122832298279, "loss/hidden": 3.546875, "loss/jsd": 0.0, "loss/logits": 0.3384062200784683, "step": 1362 }, { "epoch": 0.08525, "grad_norm": 3.53125, "grad_norm_var": 0.04871419270833333, "learning_rate": 0.0001, "loss": 8.8179, "loss/crossentropy": 2.288867473602295, "loss/hidden": 3.578125, "loss/jsd": 0.0, "loss/logits": 0.3119974285364151, "step": 1364 }, { "epoch": 0.085375, "grad_norm": 4.6875, "grad_norm_var": 0.12111002604166667, "learning_rate": 0.0001, "loss": 9.0527, "loss/crossentropy": 2.3565372228622437, "loss/hidden": 3.5390625, "loss/jsd": 0.0, "loss/logits": 0.309598833322525, "step": 1366 }, { "epoch": 0.0855, "grad_norm": 3.65625, "grad_norm_var": 0.11802978515625, "learning_rate": 0.0001, "loss": 9.0603, "loss/crossentropy": 2.361824154853821, "loss/hidden": 3.5703125, "loss/jsd": 0.0, "loss/logits": 0.32866616547107697, "step": 1368 }, { "epoch": 0.085625, "grad_norm": 3.78125, "grad_norm_var": 0.1112457275390625, "learning_rate": 0.0001, "loss": 9.1643, "loss/crossentropy": 2.2250397205352783, "loss/hidden": 3.5390625, "loss/jsd": 0.0, "loss/logits": 0.3138546347618103, "step": 1370 }, { "epoch": 0.08575, "grad_norm": 3.90625, "grad_norm_var": 0.11529541015625, "learning_rate": 0.0001, "loss": 9.2611, "loss/crossentropy": 2.5242254734039307, "loss/hidden": 3.53125, "loss/jsd": 0.0, "loss/logits": 0.33204028010368347, "step": 1372 }, { "epoch": 0.085875, "grad_norm": 3.5, "grad_norm_var": 0.11106669108072917, "learning_rate": 0.0001, "loss": 8.9304, "loss/crossentropy": 2.1475495100021362, "loss/hidden": 3.4609375, "loss/jsd": 0.0, "loss/logits": 0.3122798055410385, "step": 1374 }, { "epoch": 0.086, "grad_norm": 3.609375, "grad_norm_var": 0.11819661458333333, "learning_rate": 0.0001, "loss": 8.9845, "loss/crossentropy": 2.3691786527633667, "loss/hidden": 3.5234375, "loss/jsd": 0.0, "loss/logits": 0.33019253611564636, "step": 1376 }, { "epoch": 0.086125, "grad_norm": 3.875, "grad_norm_var": 0.09970703125, "learning_rate": 0.0001, "loss": 8.8909, "loss/crossentropy": 2.491925835609436, "loss/hidden": 3.5703125, "loss/jsd": 0.0, "loss/logits": 0.31182408332824707, "step": 1378 }, { "epoch": 0.08625, "grad_norm": 4.03125, "grad_norm_var": 0.0946929931640625, "learning_rate": 0.0001, "loss": 9.0368, "loss/crossentropy": 2.531667709350586, "loss/hidden": 3.53125, "loss/jsd": 0.0, "loss/logits": 0.31533099710941315, "step": 1380 }, { "epoch": 0.086375, "grad_norm": 3.484375, "grad_norm_var": 0.0412750244140625, "learning_rate": 0.0001, "loss": 8.9587, "loss/crossentropy": 2.4312853813171387, "loss/hidden": 3.5703125, "loss/jsd": 0.0, "loss/logits": 0.3191726803779602, "step": 1382 }, { "epoch": 0.0865, "grad_norm": 3.53125, "grad_norm_var": 0.037007649739583336, "learning_rate": 0.0001, "loss": 8.9345, "loss/crossentropy": 2.3028546571731567, "loss/hidden": 3.4765625, "loss/jsd": 0.0, "loss/logits": 0.29265616834163666, "step": 1384 }, { "epoch": 0.086625, "grad_norm": 3.84375, "grad_norm_var": 0.03870442708333333, "learning_rate": 0.0001, "loss": 9.0492, "loss/crossentropy": 2.363860607147217, "loss/hidden": 3.609375, "loss/jsd": 0.0, "loss/logits": 0.3737258017063141, "step": 1386 }, { "epoch": 0.08675, "grad_norm": 3.625, "grad_norm_var": 0.031245930989583334, "learning_rate": 0.0001, "loss": 8.9288, "loss/crossentropy": 2.317563533782959, "loss/hidden": 3.5, "loss/jsd": 0.0, "loss/logits": 0.3253827840089798, "step": 1388 }, { "epoch": 0.086875, "grad_norm": 3.734375, "grad_norm_var": 0.027018229166666668, "learning_rate": 0.0001, "loss": 8.9841, "loss/crossentropy": 2.1043782234191895, "loss/hidden": 3.5078125, "loss/jsd": 0.0, "loss/logits": 0.32640159130096436, "step": 1390 }, { "epoch": 0.087, "grad_norm": 3.8125, "grad_norm_var": 0.0204742431640625, "learning_rate": 0.0001, "loss": 9.0368, "loss/crossentropy": 2.4259365797042847, "loss/hidden": 3.4296875, "loss/jsd": 0.0, "loss/logits": 0.30903398990631104, "step": 1392 }, { "epoch": 0.087125, "grad_norm": 3.828125, "grad_norm_var": 0.019205729166666668, "learning_rate": 0.0001, "loss": 9.0183, "loss/crossentropy": 2.4740456342697144, "loss/hidden": 3.5703125, "loss/jsd": 0.0, "loss/logits": 0.3079090714454651, "step": 1394 }, { "epoch": 0.08725, "grad_norm": 4.125, "grad_norm_var": 0.023177083333333334, "learning_rate": 0.0001, "loss": 9.0947, "loss/crossentropy": 2.3828792572021484, "loss/hidden": 3.640625, "loss/jsd": 0.0, "loss/logits": 0.3404559940099716, "step": 1396 }, { "epoch": 0.087375, "grad_norm": 4.21875, "grad_norm_var": 0.03254292805989583, "learning_rate": 0.0001, "loss": 8.9672, "loss/crossentropy": 2.445147395133972, "loss/hidden": 3.5, "loss/jsd": 0.0, "loss/logits": 0.3140483498573303, "step": 1398 }, { "epoch": 0.0875, "grad_norm": 3.40625, "grad_norm_var": 0.041792805989583334, "learning_rate": 0.0001, "loss": 8.9434, "loss/crossentropy": 2.4013755321502686, "loss/hidden": 3.453125, "loss/jsd": 0.0, "loss/logits": 0.31101924180984497, "step": 1400 }, { "epoch": 0.087625, "grad_norm": 3.375, "grad_norm_var": 0.05074462890625, "learning_rate": 0.0001, "loss": 8.81, "loss/crossentropy": 2.1755102276802063, "loss/hidden": 3.4375, "loss/jsd": 0.0, "loss/logits": 0.2810830771923065, "step": 1402 }, { "epoch": 0.08775, "grad_norm": 3.5, "grad_norm_var": 0.06453450520833333, "learning_rate": 0.0001, "loss": 8.95, "loss/crossentropy": 2.576531767845154, "loss/hidden": 3.578125, "loss/jsd": 0.0, "loss/logits": 0.33312951028347015, "step": 1404 }, { "epoch": 0.087875, "grad_norm": 3.515625, "grad_norm_var": 0.06685791015625, "learning_rate": 0.0001, "loss": 9.0711, "loss/crossentropy": 2.418603301048279, "loss/hidden": 3.578125, "loss/jsd": 0.0, "loss/logits": 0.327798455953598, "step": 1406 }, { "epoch": 0.088, "grad_norm": 3.609375, "grad_norm_var": 0.0658355712890625, "learning_rate": 0.0001, "loss": 9.2586, "loss/crossentropy": 2.6568247079849243, "loss/hidden": 3.578125, "loss/jsd": 0.0, "loss/logits": 0.3096371293067932, "step": 1408 }, { "epoch": 0.088125, "grad_norm": 3.8125, "grad_norm_var": 0.0717437744140625, "learning_rate": 0.0001, "loss": 8.864, "loss/crossentropy": 2.04031902551651, "loss/hidden": 3.5703125, "loss/jsd": 0.0, "loss/logits": 0.3162307143211365, "step": 1410 }, { "epoch": 0.08825, "grad_norm": 4.09375, "grad_norm_var": 0.06897786458333334, "learning_rate": 0.0001, "loss": 9.1645, "loss/crossentropy": 2.5326061248779297, "loss/hidden": 3.59375, "loss/jsd": 0.0, "loss/logits": 0.33072784543037415, "step": 1412 }, { "epoch": 0.088375, "grad_norm": 3.640625, "grad_norm_var": 0.04348551432291667, "learning_rate": 0.0001, "loss": 9.1564, "loss/crossentropy": 2.619969367980957, "loss/hidden": 3.5546875, "loss/jsd": 0.0, "loss/logits": 0.32924045622348785, "step": 1414 }, { "epoch": 0.0885, "grad_norm": 3.984375, "grad_norm_var": 0.051488240559895836, "learning_rate": 0.0001, "loss": 9.0071, "loss/crossentropy": 2.535549759864807, "loss/hidden": 3.6015625, "loss/jsd": 0.0, "loss/logits": 0.3203812837600708, "step": 1416 }, { "epoch": 0.088625, "grad_norm": 3.765625, "grad_norm_var": 0.05335286458333333, "learning_rate": 0.0001, "loss": 9.0727, "loss/crossentropy": 2.4984034299850464, "loss/hidden": 3.5234375, "loss/jsd": 0.0, "loss/logits": 0.30122537910938263, "step": 1418 }, { "epoch": 0.08875, "grad_norm": 3.46875, "grad_norm_var": 0.04179585774739583, "learning_rate": 0.0001, "loss": 8.8205, "loss/crossentropy": 2.3398995399475098, "loss/hidden": 3.4765625, "loss/jsd": 0.0, "loss/logits": 0.30119411647319794, "step": 1420 }, { "epoch": 0.088875, "grad_norm": 3.59375, "grad_norm_var": 0.03720296223958333, "learning_rate": 0.0001, "loss": 8.8969, "loss/crossentropy": 2.265345811843872, "loss/hidden": 3.3984375, "loss/jsd": 0.0, "loss/logits": 0.2919570058584213, "step": 1422 }, { "epoch": 0.089, "grad_norm": 3.625, "grad_norm_var": 0.03683268229166667, "learning_rate": 0.0001, "loss": 8.9852, "loss/crossentropy": 2.3537477254867554, "loss/hidden": 3.5234375, "loss/jsd": 0.0, "loss/logits": 0.31161582469940186, "step": 1424 }, { "epoch": 0.089125, "grad_norm": 3.46875, "grad_norm_var": 0.029442342122395833, "learning_rate": 0.0001, "loss": 8.9986, "loss/crossentropy": 2.4521849155426025, "loss/hidden": 3.5, "loss/jsd": 0.0, "loss/logits": 0.29275740683078766, "step": 1426 }, { "epoch": 0.08925, "grad_norm": 3.71875, "grad_norm_var": 0.021434529622395834, "learning_rate": 0.0001, "loss": 8.9503, "loss/crossentropy": 2.2590759992599487, "loss/hidden": 3.6015625, "loss/jsd": 0.0, "loss/logits": 0.3151112347841263, "step": 1428 }, { "epoch": 0.089375, "grad_norm": 3.625, "grad_norm_var": 0.02408447265625, "learning_rate": 0.0001, "loss": 9.0334, "loss/crossentropy": 2.431664228439331, "loss/hidden": 3.5859375, "loss/jsd": 0.0, "loss/logits": 0.3339419513940811, "step": 1430 }, { "epoch": 0.0895, "grad_norm": 3.796875, "grad_norm_var": 0.021467081705729165, "learning_rate": 0.0001, "loss": 9.0671, "loss/crossentropy": 2.2652631998062134, "loss/hidden": 3.5859375, "loss/jsd": 0.0, "loss/logits": 0.3316595107316971, "step": 1432 }, { "epoch": 0.089625, "grad_norm": 3.78125, "grad_norm_var": 0.028034464518229166, "learning_rate": 0.0001, "loss": 8.8634, "loss/crossentropy": 2.3894190788269043, "loss/hidden": 3.4609375, "loss/jsd": 0.0, "loss/logits": 0.30771124362945557, "step": 1434 }, { "epoch": 0.08975, "grad_norm": 3.4375, "grad_norm_var": 0.03173421223958333, "learning_rate": 0.0001, "loss": 8.8962, "loss/crossentropy": 2.174624800682068, "loss/hidden": 3.4609375, "loss/jsd": 0.0, "loss/logits": 0.2859848737716675, "step": 1436 }, { "epoch": 0.089875, "grad_norm": 3.703125, "grad_norm_var": 0.0318267822265625, "learning_rate": 0.0001, "loss": 9.0001, "loss/crossentropy": 2.4872666597366333, "loss/hidden": 3.4609375, "loss/jsd": 0.0, "loss/logits": 0.29487521946430206, "step": 1438 }, { "epoch": 0.09, "grad_norm": 3.703125, "grad_norm_var": 0.03215230305989583, "learning_rate": 0.0001, "loss": 8.8425, "loss/crossentropy": 2.276022791862488, "loss/hidden": 3.3984375, "loss/jsd": 0.0, "loss/logits": 0.28920166194438934, "step": 1440 }, { "epoch": 0.090125, "grad_norm": 3.421875, "grad_norm_var": 0.03479715983072917, "learning_rate": 0.0001, "loss": 8.966, "loss/crossentropy": 2.4065338373184204, "loss/hidden": 3.5390625, "loss/jsd": 0.0, "loss/logits": 0.2963782846927643, "step": 1442 }, { "epoch": 0.09025, "grad_norm": 3.359375, "grad_norm_var": 0.041243489583333334, "learning_rate": 0.0001, "loss": 8.9235, "loss/crossentropy": 2.52418851852417, "loss/hidden": 3.53125, "loss/jsd": 0.0, "loss/logits": 0.3200630098581314, "step": 1444 }, { "epoch": 0.090375, "grad_norm": 3.296875, "grad_norm_var": 0.04114176432291667, "learning_rate": 0.0001, "loss": 8.8564, "loss/crossentropy": 2.454889178276062, "loss/hidden": 3.5546875, "loss/jsd": 0.0, "loss/logits": 0.3129142075777054, "step": 1446 }, { "epoch": 0.0905, "grad_norm": 3.75, "grad_norm_var": 0.039839680989583334, "learning_rate": 0.0001, "loss": 8.903, "loss/crossentropy": 2.3371113538742065, "loss/hidden": 3.5390625, "loss/jsd": 0.0, "loss/logits": 0.294046014547348, "step": 1448 }, { "epoch": 0.090625, "grad_norm": 4.1875, "grad_norm_var": 0.052643839518229166, "learning_rate": 0.0001, "loss": 8.8963, "loss/crossentropy": 2.4554747343063354, "loss/hidden": 3.5078125, "loss/jsd": 0.0, "loss/logits": 0.2924615442752838, "step": 1450 }, { "epoch": 0.09075, "grad_norm": 3.46875, "grad_norm_var": 0.0498687744140625, "learning_rate": 0.0001, "loss": 8.9145, "loss/crossentropy": 2.2903236150741577, "loss/hidden": 3.59375, "loss/jsd": 0.0, "loss/logits": 0.2915424406528473, "step": 1452 }, { "epoch": 0.090875, "grad_norm": 3.5, "grad_norm_var": 0.05797119140625, "learning_rate": 0.0001, "loss": 8.8917, "loss/crossentropy": 2.3047916889190674, "loss/hidden": 3.40625, "loss/jsd": 0.0, "loss/logits": 0.29666368663311005, "step": 1454 }, { "epoch": 0.091, "grad_norm": 3.578125, "grad_norm_var": 0.055517578125, "learning_rate": 0.0001, "loss": 8.815, "loss/crossentropy": 2.681693911552429, "loss/hidden": 3.5703125, "loss/jsd": 0.0, "loss/logits": 0.3014596551656723, "step": 1456 }, { "epoch": 0.091125, "grad_norm": 3.90625, "grad_norm_var": 0.8339914957682292, "learning_rate": 0.0001, "loss": 9.1305, "loss/crossentropy": 2.4676531553268433, "loss/hidden": 3.7734375, "loss/jsd": 0.0, "loss/logits": 0.37058068811893463, "step": 1458 }, { "epoch": 0.09125, "grad_norm": 3.5625, "grad_norm_var": 0.8101847330729167, "learning_rate": 0.0001, "loss": 8.9791, "loss/crossentropy": 2.41966712474823, "loss/hidden": 3.5, "loss/jsd": 0.0, "loss/logits": 0.31916917860507965, "step": 1460 }, { "epoch": 0.091375, "grad_norm": 4.09375, "grad_norm_var": 0.7877349853515625, "learning_rate": 0.0001, "loss": 8.874, "loss/crossentropy": 2.092151641845703, "loss/hidden": 3.5703125, "loss/jsd": 0.0, "loss/logits": 0.3130563199520111, "step": 1462 }, { "epoch": 0.0915, "grad_norm": 3.46875, "grad_norm_var": 0.8086100260416667, "learning_rate": 0.0001, "loss": 8.8506, "loss/crossentropy": 2.3982959985733032, "loss/hidden": 3.5078125, "loss/jsd": 0.0, "loss/logits": 0.3187362104654312, "step": 1464 }, { "epoch": 0.091625, "grad_norm": 3.96875, "grad_norm_var": 0.8220377604166667, "learning_rate": 0.0001, "loss": 8.9897, "loss/crossentropy": 2.554791212081909, "loss/hidden": 3.4609375, "loss/jsd": 0.0, "loss/logits": 0.3084287494421005, "step": 1466 }, { "epoch": 0.09175, "grad_norm": 3.40625, "grad_norm_var": 0.8275065104166667, "learning_rate": 0.0001, "loss": 8.8077, "loss/crossentropy": 2.230105757713318, "loss/hidden": 3.46875, "loss/jsd": 0.0, "loss/logits": 0.30672188103199005, "step": 1468 }, { "epoch": 0.091875, "grad_norm": 3.84375, "grad_norm_var": 0.81285400390625, "learning_rate": 0.0001, "loss": 8.7891, "loss/crossentropy": 2.3420634269714355, "loss/hidden": 3.4375, "loss/jsd": 0.0, "loss/logits": 0.28118696808815, "step": 1470 }, { "epoch": 0.092, "grad_norm": 3.609375, "grad_norm_var": 0.8321116129557292, "learning_rate": 0.0001, "loss": 8.9525, "loss/crossentropy": 2.473548173904419, "loss/hidden": 3.515625, "loss/jsd": 0.0, "loss/logits": 0.30480530858039856, "step": 1472 }, { "epoch": 0.092125, "grad_norm": 3.5625, "grad_norm_var": 0.08126627604166667, "learning_rate": 0.0001, "loss": 8.6078, "loss/crossentropy": 2.331430673599243, "loss/hidden": 3.421875, "loss/jsd": 0.0, "loss/logits": 0.28144779801368713, "step": 1474 }, { "epoch": 0.09225, "grad_norm": 3.640625, "grad_norm_var": 0.07432352701822917, "learning_rate": 0.0001, "loss": 8.9628, "loss/crossentropy": 2.106496810913086, "loss/hidden": 3.34375, "loss/jsd": 0.0, "loss/logits": 0.29021361470222473, "step": 1476 }, { "epoch": 0.092375, "grad_norm": 3.484375, "grad_norm_var": 0.05628153483072917, "learning_rate": 0.0001, "loss": 8.9618, "loss/crossentropy": 2.41057550907135, "loss/hidden": 3.4765625, "loss/jsd": 0.0, "loss/logits": 0.3081541657447815, "step": 1478 }, { "epoch": 0.0925, "grad_norm": 3.703125, "grad_norm_var": 0.05603739420572917, "learning_rate": 0.0001, "loss": 8.9632, "loss/crossentropy": 2.481987476348877, "loss/hidden": 3.46875, "loss/jsd": 0.0, "loss/logits": 0.2695133462548256, "step": 1480 }, { "epoch": 0.092625, "grad_norm": 4.6875, "grad_norm_var": 0.16575520833333332, "learning_rate": 0.0001, "loss": 8.9132, "loss/crossentropy": 2.3429198265075684, "loss/hidden": 3.390625, "loss/jsd": 0.0, "loss/logits": 0.30013199150562286, "step": 1482 }, { "epoch": 0.09275, "grad_norm": 3.3125, "grad_norm_var": 0.1684722900390625, "learning_rate": 0.0001, "loss": 8.8029, "loss/crossentropy": 2.6235066652297974, "loss/hidden": 3.6015625, "loss/jsd": 0.0, "loss/logits": 0.32275477051734924, "step": 1484 }, { "epoch": 0.092875, "grad_norm": 3.984375, "grad_norm_var": 0.1758453369140625, "learning_rate": 0.0001, "loss": 9.082, "loss/crossentropy": 2.550568461418152, "loss/hidden": 3.453125, "loss/jsd": 0.0, "loss/logits": 0.316007137298584, "step": 1486 }, { "epoch": 0.093, "grad_norm": 3.546875, "grad_norm_var": 0.16510009765625, "learning_rate": 0.0001, "loss": 8.8733, "loss/crossentropy": 2.4568541049957275, "loss/hidden": 3.5703125, "loss/jsd": 0.0, "loss/logits": 0.3310081660747528, "step": 1488 }, { "epoch": 0.093125, "grad_norm": 3.578125, "grad_norm_var": 0.1637115478515625, "learning_rate": 0.0001, "loss": 8.8808, "loss/crossentropy": 2.5027180910110474, "loss/hidden": 3.578125, "loss/jsd": 0.0, "loss/logits": 0.32694798707962036, "step": 1490 }, { "epoch": 0.09325, "grad_norm": 3.53125, "grad_norm_var": 0.16510009765625, "learning_rate": 0.0001, "loss": 8.7269, "loss/crossentropy": 2.1558250188827515, "loss/hidden": 3.453125, "loss/jsd": 0.0, "loss/logits": 0.3121803104877472, "step": 1492 }, { "epoch": 0.093375, "grad_norm": 4.53125, "grad_norm_var": 0.21354166666666666, "learning_rate": 0.0001, "loss": 8.996, "loss/crossentropy": 2.4038426876068115, "loss/hidden": 3.5703125, "loss/jsd": 0.0, "loss/logits": 0.31739044189453125, "step": 1494 }, { "epoch": 0.0935, "grad_norm": 3.15625, "grad_norm_var": 0.23286844889322916, "learning_rate": 0.0001, "loss": 8.8094, "loss/crossentropy": 2.419437289237976, "loss/hidden": 3.5546875, "loss/jsd": 0.0, "loss/logits": 0.3122180104255676, "step": 1496 }, { "epoch": 0.093625, "grad_norm": 3.3125, "grad_norm_var": 0.115283203125, "learning_rate": 0.0001, "loss": 8.7015, "loss/crossentropy": 2.3764225244522095, "loss/hidden": 3.4765625, "loss/jsd": 0.0, "loss/logits": 0.32906584441661835, "step": 1498 }, { "epoch": 0.09375, "grad_norm": 3.765625, "grad_norm_var": 0.10917867024739583, "learning_rate": 0.0001, "loss": 9.0724, "loss/crossentropy": 2.421535849571228, "loss/hidden": 3.484375, "loss/jsd": 0.0, "loss/logits": 0.2912449240684509, "step": 1500 }, { "epoch": 0.093875, "grad_norm": 3.375, "grad_norm_var": 0.09527587890625, "learning_rate": 0.0001, "loss": 8.7729, "loss/crossentropy": 2.1934632062911987, "loss/hidden": 3.4453125, "loss/jsd": 0.0, "loss/logits": 0.31051623821258545, "step": 1502 }, { "epoch": 0.094, "grad_norm": 3.421875, "grad_norm_var": 0.09427083333333333, "learning_rate": 0.0001, "loss": 8.7316, "loss/crossentropy": 2.3402860164642334, "loss/hidden": 3.4453125, "loss/jsd": 0.0, "loss/logits": 0.290997713804245, "step": 1504 }, { "epoch": 0.094125, "grad_norm": 3.6875, "grad_norm_var": 0.09648335774739583, "learning_rate": 0.0001, "loss": 8.9565, "loss/crossentropy": 2.440617799758911, "loss/hidden": 3.4609375, "loss/jsd": 0.0, "loss/logits": 0.29472315311431885, "step": 1506 }, { "epoch": 0.09425, "grad_norm": 4.46875, "grad_norm_var": 0.15945638020833333, "learning_rate": 0.0001, "loss": 9.1673, "loss/crossentropy": 2.4849953651428223, "loss/hidden": 3.4296875, "loss/jsd": 0.0, "loss/logits": 0.3361028879880905, "step": 1508 }, { "epoch": 0.094375, "grad_norm": 3.59375, "grad_norm_var": 0.10628255208333333, "learning_rate": 0.0001, "loss": 8.907, "loss/crossentropy": 2.424979090690613, "loss/hidden": 3.5625, "loss/jsd": 0.0, "loss/logits": 0.2993798553943634, "step": 1510 }, { "epoch": 0.0945, "grad_norm": 3.5, "grad_norm_var": 0.09318033854166667, "learning_rate": 0.0001, "loss": 9.0442, "loss/crossentropy": 2.418489933013916, "loss/hidden": 3.53125, "loss/jsd": 0.0, "loss/logits": 0.32049693167209625, "step": 1512 }, { "epoch": 0.094625, "grad_norm": 3.53125, "grad_norm_var": 0.08361002604166666, "learning_rate": 0.0001, "loss": 8.9006, "loss/crossentropy": 2.2703150510787964, "loss/hidden": 3.46875, "loss/jsd": 0.0, "loss/logits": 0.28213508427143097, "step": 1514 }, { "epoch": 0.09475, "grad_norm": 5.28125, "grad_norm_var": 0.2518717447916667, "learning_rate": 0.0001, "loss": 9.2422, "loss/crossentropy": 2.528243660926819, "loss/hidden": 3.5703125, "loss/jsd": 0.0, "loss/logits": 0.3030099719762802, "step": 1516 }, { "epoch": 0.094875, "grad_norm": 3.6875, "grad_norm_var": 0.23137613932291667, "learning_rate": 0.0001, "loss": 9.0565, "loss/crossentropy": 2.2802836894989014, "loss/hidden": 3.5078125, "loss/jsd": 0.0, "loss/logits": 0.30337056517601013, "step": 1518 }, { "epoch": 0.095, "grad_norm": 3.40625, "grad_norm_var": 0.22304585774739583, "learning_rate": 0.0001, "loss": 9.0657, "loss/crossentropy": 2.3259323835372925, "loss/hidden": 3.390625, "loss/jsd": 0.0, "loss/logits": 0.29459048807621, "step": 1520 }, { "epoch": 0.095125, "grad_norm": 3.96875, "grad_norm_var": 0.22170817057291667, "learning_rate": 0.0001, "loss": 8.9044, "loss/crossentropy": 2.4524760246276855, "loss/hidden": 3.453125, "loss/jsd": 0.0, "loss/logits": 0.2917432487010956, "step": 1522 }, { "epoch": 0.09525, "grad_norm": 3.703125, "grad_norm_var": 0.20701395670572917, "learning_rate": 0.0001, "loss": 8.6662, "loss/crossentropy": 2.002210795879364, "loss/hidden": 3.3828125, "loss/jsd": 0.0, "loss/logits": 0.28943265974521637, "step": 1524 }, { "epoch": 0.095375, "grad_norm": 4.03125, "grad_norm_var": 0.21230061848958334, "learning_rate": 0.0001, "loss": 9.0207, "loss/crossentropy": 2.6163631677627563, "loss/hidden": 3.5078125, "loss/jsd": 0.0, "loss/logits": 0.30554938316345215, "step": 1526 }, { "epoch": 0.0955, "grad_norm": 3.5, "grad_norm_var": 0.21051025390625, "learning_rate": 0.0001, "loss": 9.0829, "loss/crossentropy": 2.327064037322998, "loss/hidden": 3.546875, "loss/jsd": 0.0, "loss/logits": 0.3019147366285324, "step": 1528 }, { "epoch": 0.095625, "grad_norm": 3.84375, "grad_norm_var": 0.21812744140625, "learning_rate": 0.0001, "loss": 8.9037, "loss/crossentropy": 2.4404029846191406, "loss/hidden": 3.46875, "loss/jsd": 0.0, "loss/logits": 0.3117605447769165, "step": 1530 }, { "epoch": 0.09575, "grad_norm": 3.453125, "grad_norm_var": 0.06982421875, "learning_rate": 0.0001, "loss": 8.6937, "loss/crossentropy": 2.1841256618499756, "loss/hidden": 3.4140625, "loss/jsd": 0.0, "loss/logits": 0.2961889058351517, "step": 1532 }, { "epoch": 0.095875, "grad_norm": 4.1875, "grad_norm_var": 0.08876546223958333, "learning_rate": 0.0001, "loss": 8.9432, "loss/crossentropy": 2.5015478134155273, "loss/hidden": 3.5859375, "loss/jsd": 0.0, "loss/logits": 0.31438133120536804, "step": 1534 }, { "epoch": 0.096, "grad_norm": 3.515625, "grad_norm_var": 0.08181050618489584, "learning_rate": 0.0001, "loss": 9.0777, "loss/crossentropy": 2.6356844902038574, "loss/hidden": 3.5859375, "loss/jsd": 0.0, "loss/logits": 0.3276464343070984, "step": 1536 }, { "epoch": 0.096125, "grad_norm": 3.25, "grad_norm_var": 0.09553629557291667, "learning_rate": 0.0001, "loss": 8.8278, "loss/crossentropy": 2.6288344860076904, "loss/hidden": 3.453125, "loss/jsd": 0.0, "loss/logits": 0.3106349855661392, "step": 1538 }, { "epoch": 0.09625, "grad_norm": 3.65625, "grad_norm_var": 0.08912760416666667, "learning_rate": 0.0001, "loss": 8.8824, "loss/crossentropy": 2.3544297218322754, "loss/hidden": 3.421875, "loss/jsd": 0.0, "loss/logits": 0.29945333302021027, "step": 1540 }, { "epoch": 0.096375, "grad_norm": 3.671875, "grad_norm_var": 0.0782623291015625, "learning_rate": 0.0001, "loss": 8.8214, "loss/crossentropy": 2.525648593902588, "loss/hidden": 3.4765625, "loss/jsd": 0.0, "loss/logits": 0.3259720504283905, "step": 1542 }, { "epoch": 0.0965, "grad_norm": 3.703125, "grad_norm_var": 0.0704498291015625, "learning_rate": 0.0001, "loss": 9.0293, "loss/crossentropy": 2.557410717010498, "loss/hidden": 3.5703125, "loss/jsd": 0.0, "loss/logits": 0.3404108136892319, "step": 1544 }, { "epoch": 0.096625, "grad_norm": 3.578125, "grad_norm_var": 0.06096598307291667, "learning_rate": 0.0001, "loss": 8.9852, "loss/crossentropy": 2.4181408882141113, "loss/hidden": 3.4296875, "loss/jsd": 0.0, "loss/logits": 0.3040696382522583, "step": 1546 }, { "epoch": 0.09675, "grad_norm": 3.921875, "grad_norm_var": 0.06634114583333334, "learning_rate": 0.0001, "loss": 8.6302, "loss/crossentropy": 2.328543782234192, "loss/hidden": 3.53125, "loss/jsd": 0.0, "loss/logits": 0.3038486987352371, "step": 1548 }, { "epoch": 0.096875, "grad_norm": 3.75, "grad_norm_var": 0.0479400634765625, "learning_rate": 0.0001, "loss": 8.8259, "loss/crossentropy": 2.2513211965560913, "loss/hidden": 3.4609375, "loss/jsd": 0.0, "loss/logits": 0.3101038932800293, "step": 1550 }, { "epoch": 0.097, "grad_norm": 3.609375, "grad_norm_var": 0.0394195556640625, "learning_rate": 0.0001, "loss": 8.7993, "loss/crossentropy": 2.2204794883728027, "loss/hidden": 3.5390625, "loss/jsd": 0.0, "loss/logits": 0.28817546367645264, "step": 1552 }, { "epoch": 0.097125, "grad_norm": 3.4375, "grad_norm_var": 0.0351715087890625, "learning_rate": 0.0001, "loss": 8.9577, "loss/crossentropy": 2.306167244911194, "loss/hidden": 3.46875, "loss/jsd": 0.0, "loss/logits": 0.31589527428150177, "step": 1554 }, { "epoch": 0.09725, "grad_norm": 3.4375, "grad_norm_var": 0.026102701822916668, "learning_rate": 0.0001, "loss": 8.7825, "loss/crossentropy": 2.485254645347595, "loss/hidden": 3.390625, "loss/jsd": 0.0, "loss/logits": 0.3053087592124939, "step": 1556 }, { "epoch": 0.097375, "grad_norm": 3.484375, "grad_norm_var": 0.037230428059895834, "learning_rate": 0.0001, "loss": 9.0267, "loss/crossentropy": 2.21419095993042, "loss/hidden": 3.515625, "loss/jsd": 0.0, "loss/logits": 0.3267703801393509, "step": 1558 }, { "epoch": 0.0975, "grad_norm": 3.625, "grad_norm_var": 0.036375935872395834, "learning_rate": 0.0001, "loss": 8.9805, "loss/crossentropy": 2.8129303455352783, "loss/hidden": 3.5859375, "loss/jsd": 0.0, "loss/logits": 0.31060926616191864, "step": 1560 }, { "epoch": 0.097625, "grad_norm": 3.390625, "grad_norm_var": 0.03967997233072917, "learning_rate": 0.0001, "loss": 8.8682, "loss/crossentropy": 2.3201346397399902, "loss/hidden": 3.578125, "loss/jsd": 0.0, "loss/logits": 0.3333878219127655, "step": 1562 }, { "epoch": 0.09775, "grad_norm": 3.71875, "grad_norm_var": 0.06048177083333333, "learning_rate": 0.0001, "loss": 8.6913, "loss/crossentropy": 2.4905728101730347, "loss/hidden": 3.4921875, "loss/jsd": 0.0, "loss/logits": 0.3240518271923065, "step": 1564 }, { "epoch": 0.097875, "grad_norm": 3.40625, "grad_norm_var": 0.05533447265625, "learning_rate": 0.0001, "loss": 8.625, "loss/crossentropy": 2.0138303637504578, "loss/hidden": 3.484375, "loss/jsd": 0.0, "loss/logits": 0.2922722101211548, "step": 1566 }, { "epoch": 0.098, "grad_norm": 3.3125, "grad_norm_var": 0.05804036458333333, "learning_rate": 0.0001, "loss": 8.6031, "loss/crossentropy": 2.336915612220764, "loss/hidden": 3.453125, "loss/jsd": 0.0, "loss/logits": 0.3007281422615051, "step": 1568 }, { "epoch": 0.098125, "grad_norm": 3.296875, "grad_norm_var": 0.06008199055989583, "learning_rate": 0.0001, "loss": 8.7958, "loss/crossentropy": 2.304540991783142, "loss/hidden": 3.359375, "loss/jsd": 0.0, "loss/logits": 0.27709995210170746, "step": 1570 }, { "epoch": 0.09825, "grad_norm": 3.28125, "grad_norm_var": 0.060887654622395836, "learning_rate": 0.0001, "loss": 8.4869, "loss/crossentropy": 2.291286587715149, "loss/hidden": 3.328125, "loss/jsd": 0.0, "loss/logits": 0.29013076424598694, "step": 1572 }, { "epoch": 0.098375, "grad_norm": 3.140625, "grad_norm_var": 0.044798787434895834, "learning_rate": 0.0001, "loss": 8.4662, "loss/crossentropy": 2.4036799669265747, "loss/hidden": 3.5, "loss/jsd": 0.0, "loss/logits": 0.290183424949646, "step": 1574 }, { "epoch": 0.0985, "grad_norm": 3.453125, "grad_norm_var": 0.0473052978515625, "learning_rate": 0.0001, "loss": 9.0173, "loss/crossentropy": 2.3124269247055054, "loss/hidden": 3.53125, "loss/jsd": 0.0, "loss/logits": 0.3117297291755676, "step": 1576 }, { "epoch": 0.098625, "grad_norm": 3.453125, "grad_norm_var": 0.0395416259765625, "learning_rate": 0.0001, "loss": 8.6615, "loss/crossentropy": 2.0177338123321533, "loss/hidden": 3.296875, "loss/jsd": 0.0, "loss/logits": 0.27028243243694305, "step": 1578 }, { "epoch": 0.09875, "grad_norm": 3.46875, "grad_norm_var": 0.018159993489583335, "learning_rate": 0.0001, "loss": 8.9067, "loss/crossentropy": 2.3349932432174683, "loss/hidden": 3.40625, "loss/jsd": 0.0, "loss/logits": 0.31509193778038025, "step": 1580 }, { "epoch": 0.098875, "grad_norm": 3.515625, "grad_norm_var": 0.0189849853515625, "learning_rate": 0.0001, "loss": 8.7176, "loss/crossentropy": 2.599055767059326, "loss/hidden": 3.484375, "loss/jsd": 0.0, "loss/logits": 0.3019975572824478, "step": 1582 }, { "epoch": 0.099, "grad_norm": 3.515625, "grad_norm_var": 0.026537068684895835, "learning_rate": 0.0001, "loss": 8.8398, "loss/crossentropy": 2.443954348564148, "loss/hidden": 3.4375, "loss/jsd": 0.0, "loss/logits": 0.29715240001678467, "step": 1584 }, { "epoch": 0.099125, "grad_norm": 3.484375, "grad_norm_var": 0.026537068684895835, "learning_rate": 0.0001, "loss": 8.8296, "loss/crossentropy": 2.5043543577194214, "loss/hidden": 3.421875, "loss/jsd": 0.0, "loss/logits": 0.28090426325798035, "step": 1586 }, { "epoch": 0.09925, "grad_norm": 3.0, "grad_norm_var": 0.03758036295572917, "learning_rate": 0.0001, "loss": 8.5364, "loss/crossentropy": 2.267952561378479, "loss/hidden": 3.515625, "loss/jsd": 0.0, "loss/logits": 0.30116575956344604, "step": 1588 }, { "epoch": 0.099375, "grad_norm": 3.484375, "grad_norm_var": 0.0370025634765625, "learning_rate": 0.0001, "loss": 8.5028, "loss/crossentropy": 2.0941153168678284, "loss/hidden": 3.34375, "loss/jsd": 0.0, "loss/logits": 0.2714923173189163, "step": 1590 }, { "epoch": 0.0995, "grad_norm": 3.484375, "grad_norm_var": 0.031061808268229168, "learning_rate": 0.0001, "loss": 8.9053, "loss/crossentropy": 2.422881245613098, "loss/hidden": 3.4375, "loss/jsd": 0.0, "loss/logits": 0.31440040469169617, "step": 1592 }, { "epoch": 0.099625, "grad_norm": 3.265625, "grad_norm_var": 0.032404581705729164, "learning_rate": 0.0001, "loss": 8.6802, "loss/crossentropy": 2.4505289793014526, "loss/hidden": 3.375, "loss/jsd": 0.0, "loss/logits": 0.2994416207075119, "step": 1594 }, { "epoch": 0.09975, "grad_norm": 3.578125, "grad_norm_var": 0.03308817545572917, "learning_rate": 0.0001, "loss": 8.5845, "loss/crossentropy": 2.3204309940338135, "loss/hidden": 3.4140625, "loss/jsd": 0.0, "loss/logits": 0.290659636259079, "step": 1596 }, { "epoch": 0.099875, "grad_norm": 3.5, "grad_norm_var": 0.045751953125, "learning_rate": 0.0001, "loss": 8.8864, "loss/crossentropy": 2.14364230632782, "loss/hidden": 3.359375, "loss/jsd": 0.0, "loss/logits": 0.302025705575943, "step": 1598 }, { "epoch": 0.1, "grad_norm": 4.53125, "grad_norm_var": 0.11702067057291667, "learning_rate": 0.0001, "loss": 8.9408, "loss/crossentropy": 2.3856927156448364, "loss/hidden": 3.3984375, "loss/jsd": 0.0, "loss/logits": 0.2939811646938324, "step": 1600 }, { "epoch": 0.100125, "grad_norm": 4.65625, "grad_norm_var": 0.19981180826822917, "learning_rate": 0.0001, "loss": 8.9318, "loss/crossentropy": 2.393304467201233, "loss/hidden": 3.5, "loss/jsd": 0.0, "loss/logits": 0.31673501431941986, "step": 1602 }, { "epoch": 0.10025, "grad_norm": 3.609375, "grad_norm_var": 0.17449442545572916, "learning_rate": 0.0001, "loss": 8.8995, "loss/crossentropy": 2.265239119529724, "loss/hidden": 3.4921875, "loss/jsd": 0.0, "loss/logits": 0.2922002673149109, "step": 1604 }, { "epoch": 0.100375, "grad_norm": 3.4375, "grad_norm_var": 0.15900065104166666, "learning_rate": 0.0001, "loss": 8.928, "loss/crossentropy": 2.490726351737976, "loss/hidden": 3.5078125, "loss/jsd": 0.0, "loss/logits": 0.320227712392807, "step": 1606 }, { "epoch": 0.1005, "grad_norm": 3.921875, "grad_norm_var": 0.16309305826822917, "learning_rate": 0.0001, "loss": 9.0626, "loss/crossentropy": 2.3851197957992554, "loss/hidden": 3.515625, "loss/jsd": 0.0, "loss/logits": 0.33952587842941284, "step": 1608 }, { "epoch": 0.100625, "grad_norm": 3.6875, "grad_norm_var": 0.1425445556640625, "learning_rate": 0.0001, "loss": 8.5284, "loss/crossentropy": 2.4831987619400024, "loss/hidden": 3.390625, "loss/jsd": 0.0, "loss/logits": 0.28591710329055786, "step": 1610 }, { "epoch": 0.10075, "grad_norm": 3.21875, "grad_norm_var": 0.15349833170572916, "learning_rate": 0.0001, "loss": 8.5034, "loss/crossentropy": 2.3684011697769165, "loss/hidden": 3.390625, "loss/jsd": 0.0, "loss/logits": 0.3100173771381378, "step": 1612 }, { "epoch": 0.100875, "grad_norm": 3.703125, "grad_norm_var": 0.15131734212239584, "learning_rate": 0.0001, "loss": 8.6768, "loss/crossentropy": 2.2726497650146484, "loss/hidden": 3.34375, "loss/jsd": 0.0, "loss/logits": 0.2776256799697876, "step": 1614 }, { "epoch": 0.101, "grad_norm": 3.796875, "grad_norm_var": 0.11056315104166667, "learning_rate": 0.0001, "loss": 8.7838, "loss/crossentropy": 2.2839853763580322, "loss/hidden": 3.4765625, "loss/jsd": 0.0, "loss/logits": 0.28534410893917084, "step": 1616 }, { "epoch": 0.101125, "grad_norm": 3.171875, "grad_norm_var": 0.05589090983072917, "learning_rate": 0.0001, "loss": 8.768, "loss/crossentropy": 2.6459089517593384, "loss/hidden": 3.4765625, "loss/jsd": 0.0, "loss/logits": 0.31302425265312195, "step": 1618 }, { "epoch": 0.10125, "grad_norm": 3.625, "grad_norm_var": 0.05469462076822917, "learning_rate": 0.0001, "loss": 8.8765, "loss/crossentropy": 2.3666889667510986, "loss/hidden": 3.46875, "loss/jsd": 0.0, "loss/logits": 0.31573787331581116, "step": 1620 }, { "epoch": 0.101375, "grad_norm": 3.65625, "grad_norm_var": 0.05213114420572917, "learning_rate": 0.0001, "loss": 8.7946, "loss/crossentropy": 2.294891834259033, "loss/hidden": 3.4765625, "loss/jsd": 0.0, "loss/logits": 0.29489198327064514, "step": 1622 }, { "epoch": 0.1015, "grad_norm": 3.359375, "grad_norm_var": 0.050927734375, "learning_rate": 0.0001, "loss": 8.7622, "loss/crossentropy": 2.186626434326172, "loss/hidden": 3.5, "loss/jsd": 0.0, "loss/logits": 0.3071312755346298, "step": 1624 }, { "epoch": 0.101625, "grad_norm": 3.0625, "grad_norm_var": 0.06525065104166666, "learning_rate": 0.0001, "loss": 8.8552, "loss/crossentropy": 2.3912779092788696, "loss/hidden": 3.390625, "loss/jsd": 0.0, "loss/logits": 0.30470481514930725, "step": 1626 }, { "epoch": 0.10175, "grad_norm": 3.375, "grad_norm_var": 0.06083984375, "learning_rate": 0.0001, "loss": 8.7036, "loss/crossentropy": 2.236059784889221, "loss/hidden": 3.4296875, "loss/jsd": 0.0, "loss/logits": 0.29887452721595764, "step": 1628 }, { "epoch": 0.101875, "grad_norm": 3.390625, "grad_norm_var": 0.04715169270833333, "learning_rate": 0.0001, "loss": 8.7288, "loss/crossentropy": 2.213658332824707, "loss/hidden": 3.4375, "loss/jsd": 0.0, "loss/logits": 0.2943817526102066, "step": 1630 }, { "epoch": 0.102, "grad_norm": 3.703125, "grad_norm_var": 0.05207417805989583, "learning_rate": 0.0001, "loss": 8.8536, "loss/crossentropy": 2.0792142748832703, "loss/hidden": 3.5078125, "loss/jsd": 0.0, "loss/logits": 0.3170311152935028, "step": 1632 }, { "epoch": 0.102125, "grad_norm": 3.3125, "grad_norm_var": 0.05028889973958333, "learning_rate": 0.0001, "loss": 8.6659, "loss/crossentropy": 2.2078484296798706, "loss/hidden": 3.3984375, "loss/jsd": 0.0, "loss/logits": 0.3040362149477005, "step": 1634 }, { "epoch": 0.10225, "grad_norm": 3.453125, "grad_norm_var": 0.0458984375, "learning_rate": 0.0001, "loss": 8.7704, "loss/crossentropy": 2.5149965286254883, "loss/hidden": 3.359375, "loss/jsd": 0.0, "loss/logits": 0.2992193400859833, "step": 1636 }, { "epoch": 0.102375, "grad_norm": 3.625, "grad_norm_var": 0.05103759765625, "learning_rate": 0.0001, "loss": 9.0054, "loss/crossentropy": 2.3132543563842773, "loss/hidden": 3.34375, "loss/jsd": 0.0, "loss/logits": 0.28851835429668427, "step": 1638 }, { "epoch": 0.1025, "grad_norm": 3.359375, "grad_norm_var": 0.049702962239583336, "learning_rate": 0.0001, "loss": 8.6654, "loss/crossentropy": 2.3990262746810913, "loss/hidden": 3.34375, "loss/jsd": 0.0, "loss/logits": 0.286620169878006, "step": 1640 }, { "epoch": 0.102625, "grad_norm": 3.390625, "grad_norm_var": 0.03817952473958333, "learning_rate": 0.0001, "loss": 8.8415, "loss/crossentropy": 2.5137301683425903, "loss/hidden": 3.4765625, "loss/jsd": 0.0, "loss/logits": 0.3091888725757599, "step": 1642 }, { "epoch": 0.10275, "grad_norm": 3.40625, "grad_norm_var": 0.0311676025390625, "learning_rate": 0.0001, "loss": 8.641, "loss/crossentropy": 2.312187671661377, "loss/hidden": 3.4375, "loss/jsd": 0.0, "loss/logits": 0.31714877486228943, "step": 1644 }, { "epoch": 0.102875, "grad_norm": 3.546875, "grad_norm_var": 0.0481597900390625, "learning_rate": 0.0001, "loss": 8.8978, "loss/crossentropy": 2.3664920330047607, "loss/hidden": 3.4609375, "loss/jsd": 0.0, "loss/logits": 0.31635183095932007, "step": 1646 }, { "epoch": 0.103, "grad_norm": 3.359375, "grad_norm_var": 0.04122721354166667, "learning_rate": 0.0001, "loss": 8.6667, "loss/crossentropy": 2.5847058296203613, "loss/hidden": 3.484375, "loss/jsd": 0.0, "loss/logits": 0.2953044921159744, "step": 1648 }, { "epoch": 0.103125, "grad_norm": 3.953125, "grad_norm_var": 0.08193359375, "learning_rate": 0.0001, "loss": 8.9228, "loss/crossentropy": 2.601444363594055, "loss/hidden": 3.515625, "loss/jsd": 0.0, "loss/logits": 0.3334079086780548, "step": 1650 }, { "epoch": 0.10325, "grad_norm": 3.09375, "grad_norm_var": 0.09729715983072916, "learning_rate": 0.0001, "loss": 8.6107, "loss/crossentropy": 2.3851035833358765, "loss/hidden": 3.3671875, "loss/jsd": 0.0, "loss/logits": 0.29805073142051697, "step": 1652 }, { "epoch": 0.103375, "grad_norm": 3.390625, "grad_norm_var": 0.10099995930989583, "learning_rate": 0.0001, "loss": 8.7792, "loss/crossentropy": 2.383594036102295, "loss/hidden": 3.5234375, "loss/jsd": 0.0, "loss/logits": 0.3409101665019989, "step": 1654 }, { "epoch": 0.1035, "grad_norm": 3.859375, "grad_norm_var": 0.1023834228515625, "learning_rate": 0.0001, "loss": 8.8836, "loss/crossentropy": 2.386256456375122, "loss/hidden": 3.5234375, "loss/jsd": 0.0, "loss/logits": 0.31013236939907074, "step": 1656 }, { "epoch": 0.103625, "grad_norm": 3.609375, "grad_norm_var": 0.09819234212239583, "learning_rate": 0.0001, "loss": 8.7058, "loss/crossentropy": 2.4861687421798706, "loss/hidden": 3.5, "loss/jsd": 0.0, "loss/logits": 0.2857501953840256, "step": 1658 }, { "epoch": 0.10375, "grad_norm": 3.265625, "grad_norm_var": 0.10321858723958334, "learning_rate": 0.0001, "loss": 8.7162, "loss/crossentropy": 2.165019392967224, "loss/hidden": 3.34375, "loss/jsd": 0.0, "loss/logits": 0.2871998995542526, "step": 1660 }, { "epoch": 0.103875, "grad_norm": 3.1875, "grad_norm_var": 0.10045572916666666, "learning_rate": 0.0001, "loss": 8.6825, "loss/crossentropy": 2.3428409099578857, "loss/hidden": 3.40625, "loss/jsd": 0.0, "loss/logits": 0.30583275854587555, "step": 1662 }, { "epoch": 0.104, "grad_norm": 3.296875, "grad_norm_var": 0.1024810791015625, "learning_rate": 0.0001, "loss": 8.5464, "loss/crossentropy": 2.1279059648513794, "loss/hidden": 3.328125, "loss/jsd": 0.0, "loss/logits": 0.28338921070098877, "step": 1664 }, { "epoch": 0.104125, "grad_norm": 3.296875, "grad_norm_var": 0.05191650390625, "learning_rate": 0.0001, "loss": 8.6361, "loss/crossentropy": 2.109134554862976, "loss/hidden": 3.3671875, "loss/jsd": 0.0, "loss/logits": 0.28193847835063934, "step": 1666 }, { "epoch": 0.10425, "grad_norm": 3.296875, "grad_norm_var": 0.04047749837239583, "learning_rate": 0.0001, "loss": 8.601, "loss/crossentropy": 2.1180429458618164, "loss/hidden": 3.2890625, "loss/jsd": 0.0, "loss/logits": 0.2961196005344391, "step": 1668 }, { "epoch": 0.104375, "grad_norm": 3.421875, "grad_norm_var": 0.0576080322265625, "learning_rate": 0.0001, "loss": 8.6396, "loss/crossentropy": 2.313425898551941, "loss/hidden": 3.453125, "loss/jsd": 0.0, "loss/logits": 0.28072628378868103, "step": 1670 }, { "epoch": 0.1045, "grad_norm": 3.390625, "grad_norm_var": 0.04879150390625, "learning_rate": 0.0001, "loss": 8.8899, "loss/crossentropy": 2.3383524417877197, "loss/hidden": 3.578125, "loss/jsd": 0.0, "loss/logits": 0.3101891279220581, "step": 1672 }, { "epoch": 0.104625, "grad_norm": 3.6875, "grad_norm_var": 0.05182291666666667, "learning_rate": 0.0001, "loss": 8.6569, "loss/crossentropy": 2.2282902002334595, "loss/hidden": 3.359375, "loss/jsd": 0.0, "loss/logits": 0.27700501680374146, "step": 1674 }, { "epoch": 0.10475, "grad_norm": 3.546875, "grad_norm_var": 0.05043843587239583, "learning_rate": 0.0001, "loss": 8.8456, "loss/crossentropy": 2.1871705651283264, "loss/hidden": 3.4609375, "loss/jsd": 0.0, "loss/logits": 0.32123811542987823, "step": 1676 }, { "epoch": 0.104875, "grad_norm": 3.515625, "grad_norm_var": 0.04081929524739583, "learning_rate": 0.0001, "loss": 8.7159, "loss/crossentropy": 2.4735978841781616, "loss/hidden": 3.4765625, "loss/jsd": 0.0, "loss/logits": 0.3010915666818619, "step": 1678 }, { "epoch": 0.105, "grad_norm": 3.3125, "grad_norm_var": 0.04506734212239583, "learning_rate": 0.0001, "loss": 8.7137, "loss/crossentropy": 2.4544039964675903, "loss/hidden": 3.359375, "loss/jsd": 0.0, "loss/logits": 0.2948637306690216, "step": 1680 }, { "epoch": 0.105125, "grad_norm": 3.265625, "grad_norm_var": 0.044661458333333334, "learning_rate": 0.0001, "loss": 8.8799, "loss/crossentropy": 2.5841368436813354, "loss/hidden": 3.515625, "loss/jsd": 0.0, "loss/logits": 0.3055508881807327, "step": 1682 }, { "epoch": 0.10525, "grad_norm": 3.5625, "grad_norm_var": 0.048859659830729166, "learning_rate": 0.0001, "loss": 8.6805, "loss/crossentropy": 2.195699095726013, "loss/hidden": 3.4375, "loss/jsd": 0.0, "loss/logits": 0.3213060796260834, "step": 1684 }, { "epoch": 0.105375, "grad_norm": 3.953125, "grad_norm_var": 0.048193359375, "learning_rate": 0.0001, "loss": 8.6492, "loss/crossentropy": 2.3286162614822388, "loss/hidden": 3.375, "loss/jsd": 0.0, "loss/logits": 0.30889779329299927, "step": 1686 }, { "epoch": 0.1055, "grad_norm": 3.546875, "grad_norm_var": 0.046956380208333336, "learning_rate": 0.0001, "loss": 8.7456, "loss/crossentropy": 2.3685059547424316, "loss/hidden": 3.4609375, "loss/jsd": 0.0, "loss/logits": 0.3166068494319916, "step": 1688 }, { "epoch": 0.105625, "grad_norm": 3.3125, "grad_norm_var": 0.046305338541666664, "learning_rate": 0.0001, "loss": 8.5339, "loss/crossentropy": 2.444363236427307, "loss/hidden": 3.3984375, "loss/jsd": 0.0, "loss/logits": 0.30366671085357666, "step": 1690 }, { "epoch": 0.10575, "grad_norm": 3.4375, "grad_norm_var": 0.042601521809895834, "learning_rate": 0.0001, "loss": 8.7264, "loss/crossentropy": 2.3560056686401367, "loss/hidden": 3.421875, "loss/jsd": 0.0, "loss/logits": 0.33288004994392395, "step": 1692 }, { "epoch": 0.105875, "grad_norm": 3.28125, "grad_norm_var": 0.044554646809895834, "learning_rate": 0.0001, "loss": 8.5372, "loss/crossentropy": 2.38726007938385, "loss/hidden": 3.3359375, "loss/jsd": 0.0, "loss/logits": 0.28503432869911194, "step": 1694 }, { "epoch": 0.106, "grad_norm": 3.3125, "grad_norm_var": 0.040648396809895834, "learning_rate": 0.0001, "loss": 8.5069, "loss/crossentropy": 2.2538931369781494, "loss/hidden": 3.359375, "loss/jsd": 0.0, "loss/logits": 0.2696816474199295, "step": 1696 }, { "epoch": 0.106125, "grad_norm": 3.390625, "grad_norm_var": 0.04129231770833333, "learning_rate": 0.0001, "loss": 8.4485, "loss/crossentropy": 2.3338488340377808, "loss/hidden": 3.3125, "loss/jsd": 0.0, "loss/logits": 0.2753836512565613, "step": 1698 }, { "epoch": 0.10625, "grad_norm": 3.5, "grad_norm_var": 0.03697509765625, "learning_rate": 0.0001, "loss": 8.6194, "loss/crossentropy": 2.346317410469055, "loss/hidden": 3.359375, "loss/jsd": 0.0, "loss/logits": 0.2954401522874832, "step": 1700 }, { "epoch": 0.106375, "grad_norm": 3.265625, "grad_norm_var": 0.020914713541666668, "learning_rate": 0.0001, "loss": 8.684, "loss/crossentropy": 2.4051437377929688, "loss/hidden": 3.390625, "loss/jsd": 0.0, "loss/logits": 0.30432261526584625, "step": 1702 }, { "epoch": 0.1065, "grad_norm": 3.203125, "grad_norm_var": 0.018485514322916667, "learning_rate": 0.0001, "loss": 8.7033, "loss/crossentropy": 2.4638129472732544, "loss/hidden": 3.359375, "loss/jsd": 0.0, "loss/logits": 0.27916355431079865, "step": 1704 }, { "epoch": 0.106625, "grad_norm": 3.296875, "grad_norm_var": 0.0199127197265625, "learning_rate": 0.0001, "loss": 8.4718, "loss/crossentropy": 2.2523876428604126, "loss/hidden": 3.40625, "loss/jsd": 0.0, "loss/logits": 0.2789776176214218, "step": 1706 }, { "epoch": 0.10675, "grad_norm": 3.34375, "grad_norm_var": 0.024095662434895835, "learning_rate": 0.0001, "loss": 9.0841, "loss/crossentropy": 2.576385736465454, "loss/hidden": 3.59375, "loss/jsd": 0.0, "loss/logits": 0.3490666449069977, "step": 1708 }, { "epoch": 0.106875, "grad_norm": 3.890625, "grad_norm_var": 0.041520182291666666, "learning_rate": 0.0001, "loss": 9.0395, "loss/crossentropy": 2.375891089439392, "loss/hidden": 3.4609375, "loss/jsd": 0.0, "loss/logits": 0.36591947078704834, "step": 1710 }, { "epoch": 0.107, "grad_norm": 3.25, "grad_norm_var": 0.04540608723958333, "learning_rate": 0.0001, "loss": 8.8532, "loss/crossentropy": 2.3545055389404297, "loss/hidden": 3.390625, "loss/jsd": 0.0, "loss/logits": 0.3123166114091873, "step": 1712 }, { "epoch": 0.107125, "grad_norm": 3.40625, "grad_norm_var": 0.04453837076822917, "learning_rate": 0.0001, "loss": 8.5417, "loss/crossentropy": 2.412468910217285, "loss/hidden": 3.4453125, "loss/jsd": 0.0, "loss/logits": 0.29608577489852905, "step": 1714 }, { "epoch": 0.10725, "grad_norm": 3.640625, "grad_norm_var": 0.04431864420572917, "learning_rate": 0.0001, "loss": 8.7605, "loss/crossentropy": 2.3699898719787598, "loss/hidden": 3.359375, "loss/jsd": 0.0, "loss/logits": 0.2785623371601105, "step": 1716 }, { "epoch": 0.107375, "grad_norm": 3.765625, "grad_norm_var": 0.04664306640625, "learning_rate": 0.0001, "loss": 9.0555, "loss/crossentropy": 2.4347622394561768, "loss/hidden": 3.5078125, "loss/jsd": 0.0, "loss/logits": 0.32773303985595703, "step": 1718 }, { "epoch": 0.1075, "grad_norm": 3.765625, "grad_norm_var": 0.2630208333333333, "learning_rate": 0.0001, "loss": 8.6447, "loss/crossentropy": 2.3668179512023926, "loss/hidden": 3.5625, "loss/jsd": 0.0, "loss/logits": 0.31063494086265564, "step": 1720 }, { "epoch": 0.107625, "grad_norm": 3.59375, "grad_norm_var": 0.24821675618489583, "learning_rate": 0.0001, "loss": 8.963, "loss/crossentropy": 2.6537840366363525, "loss/hidden": 3.5390625, "loss/jsd": 0.0, "loss/logits": 0.31972379982471466, "step": 1722 }, { "epoch": 0.10775, "grad_norm": 3.125, "grad_norm_var": 0.26516520182291664, "learning_rate": 0.0001, "loss": 8.2949, "loss/crossentropy": 2.07137930393219, "loss/hidden": 3.3515625, "loss/jsd": 0.0, "loss/logits": 0.27931931614875793, "step": 1724 }, { "epoch": 0.107875, "grad_norm": 3.578125, "grad_norm_var": 0.26011962890625, "learning_rate": 0.0001, "loss": 8.4489, "loss/crossentropy": 2.272459626197815, "loss/hidden": 3.375, "loss/jsd": 0.0, "loss/logits": 0.28474678099155426, "step": 1726 }, { "epoch": 0.108, "grad_norm": 3.421875, "grad_norm_var": 0.26783447265625, "learning_rate": 0.0001, "loss": 8.6669, "loss/crossentropy": 2.259164571762085, "loss/hidden": 3.390625, "loss/jsd": 0.0, "loss/logits": 0.2813860774040222, "step": 1728 }, { "epoch": 0.108125, "grad_norm": 3.375, "grad_norm_var": 0.26335347493489586, "learning_rate": 0.0001, "loss": 8.7254, "loss/crossentropy": 2.3241848945617676, "loss/hidden": 3.359375, "loss/jsd": 0.0, "loss/logits": 0.2932712584733963, "step": 1730 }, { "epoch": 0.10825, "grad_norm": 3.46875, "grad_norm_var": 0.26031494140625, "learning_rate": 0.0001, "loss": 8.6529, "loss/crossentropy": 2.1817615032196045, "loss/hidden": 3.4140625, "loss/jsd": 0.0, "loss/logits": 0.28635722398757935, "step": 1732 }, { "epoch": 0.108375, "grad_norm": 3.75, "grad_norm_var": 0.2612864176432292, "learning_rate": 0.0001, "loss": 8.9668, "loss/crossentropy": 2.2815951108932495, "loss/hidden": 3.359375, "loss/jsd": 0.0, "loss/logits": 0.32627154886722565, "step": 1734 }, { "epoch": 0.1085, "grad_norm": 3.65625, "grad_norm_var": 0.032835896809895834, "learning_rate": 0.0001, "loss": 8.9081, "loss/crossentropy": 2.4856772422790527, "loss/hidden": 3.5234375, "loss/jsd": 0.0, "loss/logits": 0.3120778799057007, "step": 1736 }, { "epoch": 0.108625, "grad_norm": 3.125, "grad_norm_var": 0.039061482747395834, "learning_rate": 0.0001, "loss": 8.6702, "loss/crossentropy": 1.900740385055542, "loss/hidden": 3.3125, "loss/jsd": 0.0, "loss/logits": 0.2662765383720398, "step": 1738 }, { "epoch": 0.10875, "grad_norm": 3.515625, "grad_norm_var": 0.031966145833333334, "learning_rate": 0.0001, "loss": 8.5816, "loss/crossentropy": 2.467803955078125, "loss/hidden": 3.34375, "loss/jsd": 0.0, "loss/logits": 0.2834676057100296, "step": 1740 }, { "epoch": 0.108875, "grad_norm": 3.4375, "grad_norm_var": 0.03134358723958333, "learning_rate": 0.0001, "loss": 8.9114, "loss/crossentropy": 2.294146180152893, "loss/hidden": 3.3828125, "loss/jsd": 0.0, "loss/logits": 0.27088499069213867, "step": 1742 }, { "epoch": 0.109, "grad_norm": 3.5, "grad_norm_var": 0.026292928059895835, "learning_rate": 0.0001, "loss": 8.8052, "loss/crossentropy": 2.2657724618911743, "loss/hidden": 3.421875, "loss/jsd": 0.0, "loss/logits": 0.3134308159351349, "step": 1744 }, { "epoch": 0.109125, "grad_norm": 3.78125, "grad_norm_var": 0.03062744140625, "learning_rate": 0.0001, "loss": 8.694, "loss/crossentropy": 2.2271536588668823, "loss/hidden": 3.359375, "loss/jsd": 0.0, "loss/logits": 0.2863340824842453, "step": 1746 }, { "epoch": 0.10925, "grad_norm": 3.359375, "grad_norm_var": 0.030562337239583334, "learning_rate": 0.0001, "loss": 8.711, "loss/crossentropy": 2.3169878721237183, "loss/hidden": 3.4453125, "loss/jsd": 0.0, "loss/logits": 0.30684567987918854, "step": 1748 }, { "epoch": 0.109375, "grad_norm": 3.328125, "grad_norm_var": 0.08248697916666667, "learning_rate": 0.0001, "loss": 8.7637, "loss/crossentropy": 2.4109179973602295, "loss/hidden": 3.3828125, "loss/jsd": 0.0, "loss/logits": 0.3122059851884842, "step": 1750 }, { "epoch": 0.1095, "grad_norm": 3.5, "grad_norm_var": 0.08864644368489584, "learning_rate": 0.0001, "loss": 8.9595, "loss/crossentropy": 2.1605933904647827, "loss/hidden": 3.3515625, "loss/jsd": 0.0, "loss/logits": 0.3139963746070862, "step": 1752 }, { "epoch": 0.109625, "grad_norm": 3.296875, "grad_norm_var": 0.0794097900390625, "learning_rate": 0.0001, "loss": 8.7469, "loss/crossentropy": 2.427368640899658, "loss/hidden": 3.5390625, "loss/jsd": 0.0, "loss/logits": 0.3167697489261627, "step": 1754 }, { "epoch": 0.10975, "grad_norm": 3.359375, "grad_norm_var": 0.0843902587890625, "learning_rate": 0.0001, "loss": 8.6596, "loss/crossentropy": 2.3277207612991333, "loss/hidden": 3.3203125, "loss/jsd": 0.0, "loss/logits": 0.29165390133857727, "step": 1756 }, { "epoch": 0.109875, "grad_norm": 3.515625, "grad_norm_var": 0.07976786295572917, "learning_rate": 0.0001, "loss": 8.8038, "loss/crossentropy": 2.610532522201538, "loss/hidden": 3.3671875, "loss/jsd": 0.0, "loss/logits": 0.29831643402576447, "step": 1758 }, { "epoch": 0.11, "grad_norm": 3.859375, "grad_norm_var": 0.0923828125, "learning_rate": 0.0001, "loss": 8.9237, "loss/crossentropy": 2.4981523752212524, "loss/hidden": 3.5546875, "loss/jsd": 0.0, "loss/logits": 0.32519689202308655, "step": 1760 }, { "epoch": 0.110125, "grad_norm": 3.453125, "grad_norm_var": 0.08820699055989584, "learning_rate": 0.0001, "loss": 8.5204, "loss/crossentropy": 2.21122944355011, "loss/hidden": 3.3828125, "loss/jsd": 0.0, "loss/logits": 0.2947424352169037, "step": 1762 }, { "epoch": 0.11025, "grad_norm": 3.390625, "grad_norm_var": 0.09097391764322917, "learning_rate": 0.0001, "loss": 8.5618, "loss/crossentropy": 2.4394689798355103, "loss/hidden": 3.4296875, "loss/jsd": 0.0, "loss/logits": 0.2784826308488846, "step": 1764 }, { "epoch": 0.110375, "grad_norm": 3.203125, "grad_norm_var": 0.0383941650390625, "learning_rate": 0.0001, "loss": 8.5953, "loss/crossentropy": 2.1292494535446167, "loss/hidden": 3.375, "loss/jsd": 0.0, "loss/logits": 0.2804698646068573, "step": 1766 }, { "epoch": 0.1105, "grad_norm": 3.609375, "grad_norm_var": 0.02783203125, "learning_rate": 0.0001, "loss": 8.6388, "loss/crossentropy": 2.6741600036621094, "loss/hidden": 3.4453125, "loss/jsd": 0.0, "loss/logits": 0.3054506927728653, "step": 1768 }, { "epoch": 0.110625, "grad_norm": 3.703125, "grad_norm_var": 0.031266276041666666, "learning_rate": 0.0001, "loss": 8.6962, "loss/crossentropy": 2.2202726006507874, "loss/hidden": 3.359375, "loss/jsd": 0.0, "loss/logits": 0.26149123907089233, "step": 1770 }, { "epoch": 0.11075, "grad_norm": 3.703125, "grad_norm_var": 0.033349609375, "learning_rate": 0.0001, "loss": 8.7391, "loss/crossentropy": 2.2986572980880737, "loss/hidden": 3.3984375, "loss/jsd": 0.0, "loss/logits": 0.3005179762840271, "step": 1772 }, { "epoch": 0.110875, "grad_norm": 3.671875, "grad_norm_var": 0.03790690104166667, "learning_rate": 0.0001, "loss": 8.814, "loss/crossentropy": 2.4717568159103394, "loss/hidden": 3.359375, "loss/jsd": 0.0, "loss/logits": 0.2965603917837143, "step": 1774 }, { "epoch": 0.111, "grad_norm": 3.34375, "grad_norm_var": 0.025699869791666666, "learning_rate": 0.0001, "loss": 8.6132, "loss/crossentropy": 2.4182674884796143, "loss/hidden": 3.3671875, "loss/jsd": 0.0, "loss/logits": 0.29090292751789093, "step": 1776 }, { "epoch": 0.111125, "grad_norm": 3.484375, "grad_norm_var": 0.028229777018229166, "learning_rate": 0.0001, "loss": 8.7369, "loss/crossentropy": 2.270543932914734, "loss/hidden": 3.3125, "loss/jsd": 0.0, "loss/logits": 0.2845657169818878, "step": 1778 }, { "epoch": 0.11125, "grad_norm": 3.34375, "grad_norm_var": 0.02769775390625, "learning_rate": 0.0001, "loss": 8.6883, "loss/crossentropy": 2.4965745210647583, "loss/hidden": 3.4609375, "loss/jsd": 0.0, "loss/logits": 0.30929645895957947, "step": 1780 }, { "epoch": 0.111375, "grad_norm": 3.3125, "grad_norm_var": 0.025927734375, "learning_rate": 0.0001, "loss": 8.2737, "loss/crossentropy": 1.9217499494552612, "loss/hidden": 3.53125, "loss/jsd": 0.0, "loss/logits": 0.26199381053447723, "step": 1782 }, { "epoch": 0.1115, "grad_norm": 3.203125, "grad_norm_var": 0.03183186848958333, "learning_rate": 0.0001, "loss": 8.9253, "loss/crossentropy": 2.4939075708389282, "loss/hidden": 3.59375, "loss/jsd": 0.0, "loss/logits": 0.3338964730501175, "step": 1784 }, { "epoch": 0.111625, "grad_norm": 3.171875, "grad_norm_var": 0.03540751139322917, "learning_rate": 0.0001, "loss": 8.4357, "loss/crossentropy": 2.3698354959487915, "loss/hidden": 3.34375, "loss/jsd": 0.0, "loss/logits": 0.28977689146995544, "step": 1786 }, { "epoch": 0.11175, "grad_norm": 4.03125, "grad_norm_var": 0.05469462076822917, "learning_rate": 0.0001, "loss": 8.7838, "loss/crossentropy": 2.4376271963119507, "loss/hidden": 3.3671875, "loss/jsd": 0.0, "loss/logits": 0.2988607585430145, "step": 1788 }, { "epoch": 0.111875, "grad_norm": 3.59375, "grad_norm_var": 0.1191070556640625, "learning_rate": 0.0001, "loss": 8.9321, "loss/crossentropy": 2.229245901107788, "loss/hidden": 3.4140625, "loss/jsd": 0.0, "loss/logits": 0.2898672968149185, "step": 1790 }, { "epoch": 0.112, "grad_norm": 3.546875, "grad_norm_var": 0.12310791015625, "learning_rate": 0.0001, "loss": 8.6884, "loss/crossentropy": 2.5585025548934937, "loss/hidden": 3.5078125, "loss/jsd": 0.0, "loss/logits": 0.30431005358695984, "step": 1792 }, { "epoch": 0.112125, "grad_norm": 3.15625, "grad_norm_var": 0.12705078125, "learning_rate": 0.0001, "loss": 8.7439, "loss/crossentropy": 2.237270951271057, "loss/hidden": 3.375, "loss/jsd": 0.0, "loss/logits": 0.29220762848854065, "step": 1794 }, { "epoch": 0.11225, "grad_norm": 3.3125, "grad_norm_var": 0.12760009765625, "learning_rate": 0.0001, "loss": 8.5924, "loss/crossentropy": 2.4742521047592163, "loss/hidden": 3.4453125, "loss/jsd": 0.0, "loss/logits": 0.2950773537158966, "step": 1796 }, { "epoch": 0.112375, "grad_norm": 3.40625, "grad_norm_var": 0.12604878743489584, "learning_rate": 0.0001, "loss": 8.5864, "loss/crossentropy": 2.39210307598114, "loss/hidden": 3.40625, "loss/jsd": 0.0, "loss/logits": 0.30888137221336365, "step": 1798 }, { "epoch": 0.1125, "grad_norm": 3.625, "grad_norm_var": 0.12141927083333333, "learning_rate": 0.0001, "loss": 8.7044, "loss/crossentropy": 2.3993630409240723, "loss/hidden": 3.3984375, "loss/jsd": 0.0, "loss/logits": 0.2913102060556412, "step": 1800 }, { "epoch": 0.112625, "grad_norm": 3.484375, "grad_norm_var": 0.11155192057291667, "learning_rate": 0.0001, "loss": 8.8666, "loss/crossentropy": 2.682582974433899, "loss/hidden": 3.3671875, "loss/jsd": 0.0, "loss/logits": 0.31557750701904297, "step": 1802 }, { "epoch": 0.11275, "grad_norm": 3.328125, "grad_norm_var": 0.0958404541015625, "learning_rate": 0.0001, "loss": 8.4966, "loss/crossentropy": 2.0429012775421143, "loss/hidden": 3.2890625, "loss/jsd": 0.0, "loss/logits": 0.26017338037490845, "step": 1804 }, { "epoch": 0.112875, "grad_norm": 3.390625, "grad_norm_var": 0.017455037434895834, "learning_rate": 0.0001, "loss": 8.6763, "loss/crossentropy": 2.3047099113464355, "loss/hidden": 3.5, "loss/jsd": 0.0, "loss/logits": 0.31745266914367676, "step": 1806 }, { "epoch": 0.113, "grad_norm": 3.3125, "grad_norm_var": 0.0132965087890625, "learning_rate": 0.0001, "loss": 8.4867, "loss/crossentropy": 2.1992413997650146, "loss/hidden": 3.4375, "loss/jsd": 0.0, "loss/logits": 0.32566145062446594, "step": 1808 }, { "epoch": 0.113125, "grad_norm": 3.75, "grad_norm_var": 0.01890869140625, "learning_rate": 0.0001, "loss": 8.8092, "loss/crossentropy": 2.771738290786743, "loss/hidden": 3.375, "loss/jsd": 0.0, "loss/logits": 0.304366871714592, "step": 1810 }, { "epoch": 0.11325, "grad_norm": 3.28125, "grad_norm_var": 0.019245402018229166, "learning_rate": 0.0001, "loss": 8.6466, "loss/crossentropy": 2.1588589549064636, "loss/hidden": 3.3125, "loss/jsd": 0.0, "loss/logits": 0.2743106037378311, "step": 1812 }, { "epoch": 0.113375, "grad_norm": 3.53125, "grad_norm_var": 0.022021484375, "learning_rate": 0.0001, "loss": 8.7871, "loss/crossentropy": 2.143616557121277, "loss/hidden": 3.359375, "loss/jsd": 0.0, "loss/logits": 0.33632121980190277, "step": 1814 }, { "epoch": 0.1135, "grad_norm": 3.421875, "grad_norm_var": 0.019059244791666666, "learning_rate": 0.0001, "loss": 8.8236, "loss/crossentropy": 2.4321892261505127, "loss/hidden": 3.421875, "loss/jsd": 0.0, "loss/logits": 0.3019208610057831, "step": 1816 }, { "epoch": 0.113625, "grad_norm": 3.15625, "grad_norm_var": 0.022150675455729168, "learning_rate": 0.0001, "loss": 8.6216, "loss/crossentropy": 2.4934128522872925, "loss/hidden": 3.4453125, "loss/jsd": 0.0, "loss/logits": 0.30101920664310455, "step": 1818 }, { "epoch": 0.11375, "grad_norm": 3.53125, "grad_norm_var": 0.0219390869140625, "learning_rate": 0.0001, "loss": 8.7396, "loss/crossentropy": 2.309072256088257, "loss/hidden": 3.34375, "loss/jsd": 0.0, "loss/logits": 0.27716949582099915, "step": 1820 }, { "epoch": 0.113875, "grad_norm": 3.15625, "grad_norm_var": 0.0274566650390625, "learning_rate": 0.0001, "loss": 8.7831, "loss/crossentropy": 2.37872850894928, "loss/hidden": 3.3359375, "loss/jsd": 0.0, "loss/logits": 0.2861686944961548, "step": 1822 }, { "epoch": 0.114, "grad_norm": 3.234375, "grad_norm_var": 0.04157613118489583, "learning_rate": 0.0001, "loss": 8.673, "loss/crossentropy": 2.3230080604553223, "loss/hidden": 3.2890625, "loss/jsd": 0.0, "loss/logits": 0.2735777944326401, "step": 1824 }, { "epoch": 0.114125, "grad_norm": 3.46875, "grad_norm_var": 0.0411773681640625, "learning_rate": 0.0001, "loss": 8.5909, "loss/crossentropy": 2.421627402305603, "loss/hidden": 3.328125, "loss/jsd": 0.0, "loss/logits": 0.3164493143558502, "step": 1826 }, { "epoch": 0.11425, "grad_norm": 3.53125, "grad_norm_var": 0.041304524739583334, "learning_rate": 0.0001, "loss": 8.4245, "loss/crossentropy": 2.378306269645691, "loss/hidden": 3.3359375, "loss/jsd": 0.0, "loss/logits": 0.28885410726070404, "step": 1828 }, { "epoch": 0.114375, "grad_norm": 3.328125, "grad_norm_var": 0.04429931640625, "learning_rate": 0.0001, "loss": 8.586, "loss/crossentropy": 2.2422314882278442, "loss/hidden": 3.3046875, "loss/jsd": 0.0, "loss/logits": 0.28581804037094116, "step": 1830 }, { "epoch": 0.1145, "grad_norm": 3.40625, "grad_norm_var": 0.04399312337239583, "learning_rate": 0.0001, "loss": 8.893, "loss/crossentropy": 2.5272161960601807, "loss/hidden": 3.421875, "loss/jsd": 0.0, "loss/logits": 0.29680605232715607, "step": 1832 }, { "epoch": 0.114625, "grad_norm": 3.328125, "grad_norm_var": 0.03966471354166667, "learning_rate": 0.0001, "loss": 8.6886, "loss/crossentropy": 2.5110833644866943, "loss/hidden": 3.4140625, "loss/jsd": 0.0, "loss/logits": 0.30536364018917084, "step": 1834 }, { "epoch": 0.11475, "grad_norm": 3.359375, "grad_norm_var": 0.039794921875, "learning_rate": 0.0001, "loss": 8.6081, "loss/crossentropy": 2.2756314277648926, "loss/hidden": 3.4765625, "loss/jsd": 0.0, "loss/logits": 0.329521119594574, "step": 1836 }, { "epoch": 0.114875, "grad_norm": 3.296875, "grad_norm_var": 0.03609619140625, "learning_rate": 0.0001, "loss": 8.7375, "loss/crossentropy": 2.3063724040985107, "loss/hidden": 3.3984375, "loss/jsd": 0.0, "loss/logits": 0.28737129271030426, "step": 1838 }, { "epoch": 0.115, "grad_norm": 3.4375, "grad_norm_var": 0.018863932291666666, "learning_rate": 0.0001, "loss": 8.8197, "loss/crossentropy": 2.3004164695739746, "loss/hidden": 3.34375, "loss/jsd": 0.0, "loss/logits": 0.28277695178985596, "step": 1840 }, { "epoch": 0.115125, "grad_norm": 3.140625, "grad_norm_var": 0.023563639322916666, "learning_rate": 0.0001, "loss": 8.7529, "loss/crossentropy": 2.41781747341156, "loss/hidden": 3.328125, "loss/jsd": 0.0, "loss/logits": 0.2941035330295563, "step": 1842 }, { "epoch": 0.11525, "grad_norm": 3.234375, "grad_norm_var": 0.025593058268229166, "learning_rate": 0.0001, "loss": 8.659, "loss/crossentropy": 2.215361475944519, "loss/hidden": 3.3046875, "loss/jsd": 0.0, "loss/logits": 0.2741749882698059, "step": 1844 }, { "epoch": 0.115375, "grad_norm": 3.234375, "grad_norm_var": 0.023053995768229165, "learning_rate": 0.0001, "loss": 8.5763, "loss/crossentropy": 2.263062834739685, "loss/hidden": 3.390625, "loss/jsd": 0.0, "loss/logits": 0.2967001497745514, "step": 1846 }, { "epoch": 0.1155, "grad_norm": 3.390625, "grad_norm_var": 0.023160807291666665, "learning_rate": 0.0001, "loss": 8.5134, "loss/crossentropy": 2.3583621978759766, "loss/hidden": 3.3046875, "loss/jsd": 0.0, "loss/logits": 0.2846299409866333, "step": 1848 }, { "epoch": 0.115625, "grad_norm": 3.421875, "grad_norm_var": 0.022883097330729168, "learning_rate": 0.0001, "loss": 8.7018, "loss/crossentropy": 2.3313838243484497, "loss/hidden": 3.3359375, "loss/jsd": 0.0, "loss/logits": 0.27793991565704346, "step": 1850 }, { "epoch": 0.11575, "grad_norm": 3.140625, "grad_norm_var": 0.0206695556640625, "learning_rate": 0.0001, "loss": 8.6257, "loss/crossentropy": 2.5213606357574463, "loss/hidden": 3.328125, "loss/jsd": 0.0, "loss/logits": 0.2858508974313736, "step": 1852 }, { "epoch": 0.115875, "grad_norm": 5.125, "grad_norm_var": 0.22066650390625, "learning_rate": 0.0001, "loss": 8.5687, "loss/crossentropy": 2.138124704360962, "loss/hidden": 3.3125, "loss/jsd": 0.0, "loss/logits": 0.2705334722995758, "step": 1854 }, { "epoch": 0.116, "grad_norm": 3.5, "grad_norm_var": 0.22082926432291666, "learning_rate": 0.0001, "loss": 8.692, "loss/crossentropy": 2.190787434577942, "loss/hidden": 3.296875, "loss/jsd": 0.0, "loss/logits": 0.2934059798717499, "step": 1856 }, { "epoch": 0.116125, "grad_norm": 3.8125, "grad_norm_var": 0.21988016764322918, "learning_rate": 0.0001, "loss": 8.9455, "loss/crossentropy": 2.4185396432876587, "loss/hidden": 3.296875, "loss/jsd": 0.0, "loss/logits": 0.2919394373893738, "step": 1858 }, { "epoch": 0.11625, "grad_norm": 3.359375, "grad_norm_var": 0.2108306884765625, "learning_rate": 0.0001, "loss": 8.7029, "loss/crossentropy": 2.4572391510009766, "loss/hidden": 3.421875, "loss/jsd": 0.0, "loss/logits": 0.28716185688972473, "step": 1860 }, { "epoch": 0.116375, "grad_norm": 3.203125, "grad_norm_var": 0.21071675618489583, "learning_rate": 0.0001, "loss": 8.5979, "loss/crossentropy": 2.280961036682129, "loss/hidden": 3.296875, "loss/jsd": 0.0, "loss/logits": 0.3058573454618454, "step": 1862 }, { "epoch": 0.1165, "grad_norm": 3.15625, "grad_norm_var": 0.22084859212239583, "learning_rate": 0.0001, "loss": 8.4776, "loss/crossentropy": 2.2182366847991943, "loss/hidden": 3.328125, "loss/jsd": 0.0, "loss/logits": 0.2703913003206253, "step": 1864 }, { "epoch": 0.116625, "grad_norm": 3.953125, "grad_norm_var": 0.23896382649739584, "learning_rate": 0.0001, "loss": 8.6861, "loss/crossentropy": 2.3092691898345947, "loss/hidden": 3.40625, "loss/jsd": 0.0, "loss/logits": 0.2828802317380905, "step": 1866 }, { "epoch": 0.11675, "grad_norm": 3.625, "grad_norm_var": 0.2319244384765625, "learning_rate": 0.0001, "loss": 8.7237, "loss/crossentropy": 2.465211868286133, "loss/hidden": 3.359375, "loss/jsd": 0.0, "loss/logits": 0.28346388041973114, "step": 1868 }, { "epoch": 0.116875, "grad_norm": 3.390625, "grad_norm_var": 0.04586181640625, "learning_rate": 0.0001, "loss": 8.5473, "loss/crossentropy": 2.289981722831726, "loss/hidden": 3.328125, "loss/jsd": 0.0, "loss/logits": 0.27946069836616516, "step": 1870 }, { "epoch": 0.117, "grad_norm": 3.765625, "grad_norm_var": 0.0533599853515625, "learning_rate": 0.0001, "loss": 8.7206, "loss/crossentropy": 2.3901021480560303, "loss/hidden": 3.3671875, "loss/jsd": 0.0, "loss/logits": 0.27738629281520844, "step": 1872 }, { "epoch": 0.117125, "grad_norm": 3.71875, "grad_norm_var": 0.06018778483072917, "learning_rate": 0.0001, "loss": 8.8517, "loss/crossentropy": 2.143779933452606, "loss/hidden": 3.3515625, "loss/jsd": 0.0, "loss/logits": 0.2891843765974045, "step": 1874 }, { "epoch": 0.11725, "grad_norm": 3.390625, "grad_norm_var": 0.0638092041015625, "learning_rate": 0.0001, "loss": 8.7417, "loss/crossentropy": 2.1701208353042603, "loss/hidden": 3.3828125, "loss/jsd": 0.0, "loss/logits": 0.32332536578178406, "step": 1876 }, { "epoch": 0.117375, "grad_norm": 4.0, "grad_norm_var": 0.07613932291666667, "learning_rate": 0.0001, "loss": 8.5934, "loss/crossentropy": 2.075779378414154, "loss/hidden": 3.421875, "loss/jsd": 0.0, "loss/logits": 0.2908122092485428, "step": 1878 }, { "epoch": 0.1175, "grad_norm": 3.296875, "grad_norm_var": 0.06750895182291666, "learning_rate": 0.0001, "loss": 8.509, "loss/crossentropy": 2.402305006980896, "loss/hidden": 3.34375, "loss/jsd": 0.0, "loss/logits": 0.30343984067440033, "step": 1880 }, { "epoch": 0.117625, "grad_norm": 3.109375, "grad_norm_var": 0.06417643229166667, "learning_rate": 0.0001, "loss": 8.712, "loss/crossentropy": 2.4394556283950806, "loss/hidden": 3.359375, "loss/jsd": 0.0, "loss/logits": 0.30280593037605286, "step": 1882 }, { "epoch": 0.11775, "grad_norm": 3.046875, "grad_norm_var": 0.07830301920572917, "learning_rate": 0.0001, "loss": 8.3853, "loss/crossentropy": 2.2950029373168945, "loss/hidden": 3.34375, "loss/jsd": 0.0, "loss/logits": 0.25343556702136993, "step": 1884 }, { "epoch": 0.117875, "grad_norm": 3.140625, "grad_norm_var": 0.08426005045572917, "learning_rate": 0.0001, "loss": 8.7969, "loss/crossentropy": 2.3692102432250977, "loss/hidden": 3.4296875, "loss/jsd": 0.0, "loss/logits": 0.2964879274368286, "step": 1886 }, { "epoch": 0.118, "grad_norm": 3.484375, "grad_norm_var": 0.0810546875, "learning_rate": 0.0001, "loss": 8.4742, "loss/crossentropy": 2.475126624107361, "loss/hidden": 3.3359375, "loss/jsd": 0.0, "loss/logits": 0.3013022840023041, "step": 1888 }, { "epoch": 0.118125, "grad_norm": 3.4375, "grad_norm_var": 0.0604156494140625, "learning_rate": 0.0001, "loss": 8.5953, "loss/crossentropy": 2.4823808670043945, "loss/hidden": 3.328125, "loss/jsd": 0.0, "loss/logits": 0.27870962023735046, "step": 1890 }, { "epoch": 0.11825, "grad_norm": 3.125, "grad_norm_var": 0.062474568684895836, "learning_rate": 0.0001, "loss": 8.3978, "loss/crossentropy": 2.3761686086654663, "loss/hidden": 3.359375, "loss/jsd": 0.0, "loss/logits": 0.3016834706068039, "step": 1892 }, { "epoch": 0.118375, "grad_norm": 3.28125, "grad_norm_var": 0.13007710774739584, "learning_rate": 0.0001, "loss": 8.7529, "loss/crossentropy": 2.1992992162704468, "loss/hidden": 3.4453125, "loss/jsd": 0.0, "loss/logits": 0.3106433153152466, "step": 1894 }, { "epoch": 0.1185, "grad_norm": 3.890625, "grad_norm_var": 1.60845947265625, "learning_rate": 0.0001, "loss": 9.032, "loss/crossentropy": 2.3841880559921265, "loss/hidden": 3.4375, "loss/jsd": 0.0, "loss/logits": 0.32507331669330597, "step": 1896 }, { "epoch": 0.118625, "grad_norm": 3.296875, "grad_norm_var": 1.5833943684895833, "learning_rate": 0.0001, "loss": 8.7286, "loss/crossentropy": 2.397653102874756, "loss/hidden": 3.375, "loss/jsd": 0.0, "loss/logits": 0.30125096440315247, "step": 1898 }, { "epoch": 0.11875, "grad_norm": 3.453125, "grad_norm_var": 1.5594309488932292, "learning_rate": 0.0001, "loss": 8.6239, "loss/crossentropy": 2.342803478240967, "loss/hidden": 3.375, "loss/jsd": 0.0, "loss/logits": 0.2870272248983383, "step": 1900 }, { "epoch": 0.118875, "grad_norm": 3.375, "grad_norm_var": 1.5496815999348958, "learning_rate": 0.0001, "loss": 8.755, "loss/crossentropy": 2.4685518741607666, "loss/hidden": 3.3671875, "loss/jsd": 0.0, "loss/logits": 0.30882811546325684, "step": 1902 }, { "epoch": 0.119, "grad_norm": 3.65625, "grad_norm_var": 1.5260050455729166, "learning_rate": 0.0001, "loss": 8.7347, "loss/crossentropy": 2.0706852674484253, "loss/hidden": 3.2734375, "loss/jsd": 0.0, "loss/logits": 0.29542240500450134, "step": 1904 }, { "epoch": 0.119125, "grad_norm": 4.5, "grad_norm_var": 1.541087849934896, "learning_rate": 0.0001, "loss": 8.7724, "loss/crossentropy": 2.156776189804077, "loss/hidden": 3.359375, "loss/jsd": 0.0, "loss/logits": 0.2645677626132965, "step": 1906 }, { "epoch": 0.11925, "grad_norm": 3.4375, "grad_norm_var": 1.529638671875, "learning_rate": 0.0001, "loss": 8.5694, "loss/crossentropy": 2.1738698482513428, "loss/hidden": 3.34375, "loss/jsd": 0.0, "loss/logits": 0.28860998153686523, "step": 1908 }, { "epoch": 0.119375, "grad_norm": 3.421875, "grad_norm_var": 1.4942779541015625, "learning_rate": 0.0001, "loss": 8.8526, "loss/crossentropy": 2.407730460166931, "loss/hidden": 3.3515625, "loss/jsd": 0.0, "loss/logits": 0.32247424125671387, "step": 1910 }, { "epoch": 0.1195, "grad_norm": 3.484375, "grad_norm_var": 0.09680074055989583, "learning_rate": 0.0001, "loss": 8.6226, "loss/crossentropy": 2.0718756914138794, "loss/hidden": 3.296875, "loss/jsd": 0.0, "loss/logits": 0.2795727103948593, "step": 1912 }, { "epoch": 0.119625, "grad_norm": 3.109375, "grad_norm_var": 0.0980377197265625, "learning_rate": 0.0001, "loss": 8.6584, "loss/crossentropy": 2.28408420085907, "loss/hidden": 3.34375, "loss/jsd": 0.0, "loss/logits": 0.30408230423927307, "step": 1914 }, { "epoch": 0.11975, "grad_norm": 3.015625, "grad_norm_var": 0.1058990478515625, "learning_rate": 0.0001, "loss": 8.6341, "loss/crossentropy": 2.3821157217025757, "loss/hidden": 3.4765625, "loss/jsd": 0.0, "loss/logits": 0.30592113733291626, "step": 1916 }, { "epoch": 0.119875, "grad_norm": 3.484375, "grad_norm_var": 0.1066558837890625, "learning_rate": 0.0001, "loss": 8.5097, "loss/crossentropy": 2.4786245822906494, "loss/hidden": 3.375, "loss/jsd": 0.0, "loss/logits": 0.29681436717510223, "step": 1918 }, { "epoch": 0.12, "grad_norm": 3.296875, "grad_norm_var": 0.105810546875, "learning_rate": 0.0001, "loss": 8.8635, "loss/crossentropy": 2.56216299533844, "loss/hidden": 3.4375, "loss/jsd": 0.0, "loss/logits": 0.29444751143455505, "step": 1920 }, { "epoch": 0.120125, "grad_norm": 3.1875, "grad_norm_var": 0.025609334309895832, "learning_rate": 0.0001, "loss": 8.7351, "loss/crossentropy": 2.4210604429244995, "loss/hidden": 3.2890625, "loss/jsd": 0.0, "loss/logits": 0.27522487938404083, "step": 1922 }, { "epoch": 0.12025, "grad_norm": 3.3125, "grad_norm_var": 0.024323527018229166, "learning_rate": 0.0001, "loss": 8.4784, "loss/crossentropy": 2.239919900894165, "loss/hidden": 3.3984375, "loss/jsd": 0.0, "loss/logits": 0.28240087628364563, "step": 1924 }, { "epoch": 0.120375, "grad_norm": 3.53125, "grad_norm_var": 0.024608357747395834, "learning_rate": 0.0001, "loss": 8.756, "loss/crossentropy": 2.387734532356262, "loss/hidden": 3.453125, "loss/jsd": 0.0, "loss/logits": 0.32171347737312317, "step": 1926 }, { "epoch": 0.1205, "grad_norm": 3.34375, "grad_norm_var": 0.026854451497395834, "learning_rate": 0.0001, "loss": 8.3875, "loss/crossentropy": 2.3138844966888428, "loss/hidden": 3.3515625, "loss/jsd": 0.0, "loss/logits": 0.2761044204235077, "step": 1928 }, { "epoch": 0.120625, "grad_norm": 3.171875, "grad_norm_var": 0.024250284830729166, "learning_rate": 0.0001, "loss": 8.8666, "loss/crossentropy": 2.3094791173934937, "loss/hidden": 3.4609375, "loss/jsd": 0.0, "loss/logits": 0.308842197060585, "step": 1930 }, { "epoch": 0.12075, "grad_norm": 3.453125, "grad_norm_var": 0.028807576497395834, "learning_rate": 0.0001, "loss": 8.5039, "loss/crossentropy": 2.2231727838516235, "loss/hidden": 3.328125, "loss/jsd": 0.0, "loss/logits": 0.28627289831638336, "step": 1932 }, { "epoch": 0.120875, "grad_norm": 3.390625, "grad_norm_var": 0.0268218994140625, "learning_rate": 0.0001, "loss": 8.6522, "loss/crossentropy": 2.6999902725219727, "loss/hidden": 3.453125, "loss/jsd": 0.0, "loss/logits": 0.287996843457222, "step": 1934 }, { "epoch": 0.121, "grad_norm": 3.328125, "grad_norm_var": 0.0267486572265625, "learning_rate": 0.0001, "loss": 8.4122, "loss/crossentropy": 2.383345127105713, "loss/hidden": 3.3671875, "loss/jsd": 0.0, "loss/logits": 0.2961116135120392, "step": 1936 }, { "epoch": 0.121125, "grad_norm": 3.3125, "grad_norm_var": 0.0250152587890625, "learning_rate": 0.0001, "loss": 8.6214, "loss/crossentropy": 2.291685461997986, "loss/hidden": 3.296875, "loss/jsd": 0.0, "loss/logits": 0.27680595219135284, "step": 1938 }, { "epoch": 0.12125, "grad_norm": 3.34375, "grad_norm_var": 0.024828084309895835, "learning_rate": 0.0001, "loss": 8.8914, "loss/crossentropy": 2.4618613719940186, "loss/hidden": 3.328125, "loss/jsd": 0.0, "loss/logits": 0.3022569268941879, "step": 1940 }, { "epoch": 0.121375, "grad_norm": 3.234375, "grad_norm_var": 0.022606404622395833, "learning_rate": 0.0001, "loss": 8.5706, "loss/crossentropy": 2.3783398866653442, "loss/hidden": 3.34375, "loss/jsd": 0.0, "loss/logits": 0.3050181269645691, "step": 1942 }, { "epoch": 0.1215, "grad_norm": 3.53125, "grad_norm_var": 0.0193511962890625, "learning_rate": 0.0001, "loss": 8.6295, "loss/crossentropy": 2.1171228885650635, "loss/hidden": 3.3046875, "loss/jsd": 0.0, "loss/logits": 0.26293374598026276, "step": 1944 }, { "epoch": 0.121625, "grad_norm": 4.6875, "grad_norm_var": 1.4903310139973958, "learning_rate": 0.0001, "loss": 8.7994, "loss/crossentropy": 2.255262017250061, "loss/hidden": 3.40625, "loss/jsd": 0.0, "loss/logits": 0.3071945011615753, "step": 1946 }, { "epoch": 0.12175, "grad_norm": 3.53125, "grad_norm_var": 1.4472819010416667, "learning_rate": 0.0001, "loss": 8.4087, "loss/crossentropy": 2.3733495473861694, "loss/hidden": 3.328125, "loss/jsd": 0.0, "loss/logits": 0.28868359327316284, "step": 1948 }, { "epoch": 0.121875, "grad_norm": 3.4375, "grad_norm_var": 1.4477701822916667, "learning_rate": 0.0001, "loss": 8.6656, "loss/crossentropy": 2.264806866645813, "loss/hidden": 3.3359375, "loss/jsd": 0.0, "loss/logits": 0.2763102054595947, "step": 1950 }, { "epoch": 0.122, "grad_norm": 3.1875, "grad_norm_var": 1.4537394205729166, "learning_rate": 0.0001, "loss": 8.4854, "loss/crossentropy": 2.217471718788147, "loss/hidden": 3.359375, "loss/jsd": 0.0, "loss/logits": 0.2726486176252365, "step": 1952 }, { "epoch": 0.122125, "grad_norm": 3.1875, "grad_norm_var": 1.4633951822916667, "learning_rate": 0.0001, "loss": 8.8987, "loss/crossentropy": 2.348217725753784, "loss/hidden": 3.4140625, "loss/jsd": 0.0, "loss/logits": 0.3073858618736267, "step": 1954 }, { "epoch": 0.12225, "grad_norm": 3.15625, "grad_norm_var": 1.4752604166666667, "learning_rate": 0.0001, "loss": 8.5881, "loss/crossentropy": 2.3927940130233765, "loss/hidden": 3.3046875, "loss/jsd": 0.0, "loss/logits": 0.2712845206260681, "step": 1956 }, { "epoch": 0.122375, "grad_norm": 3.53125, "grad_norm_var": 1.4529693603515625, "learning_rate": 0.0001, "loss": 8.5416, "loss/crossentropy": 2.251898407936096, "loss/hidden": 3.3203125, "loss/jsd": 0.0, "loss/logits": 0.269486665725708, "step": 1958 }, { "epoch": 0.1225, "grad_norm": 3.21875, "grad_norm_var": 1.4741363525390625, "learning_rate": 0.0001, "loss": 8.558, "loss/crossentropy": 2.247495174407959, "loss/hidden": 3.46875, "loss/jsd": 0.0, "loss/logits": 0.2935321629047394, "step": 1960 }, { "epoch": 0.122625, "grad_norm": 5.0, "grad_norm_var": 0.20015869140625, "learning_rate": 0.0001, "loss": 8.6442, "loss/crossentropy": 2.0654982328414917, "loss/hidden": 3.28125, "loss/jsd": 0.0, "loss/logits": 0.2634906470775604, "step": 1962 }, { "epoch": 0.12275, "grad_norm": 3.375, "grad_norm_var": 0.21668192545572917, "learning_rate": 0.0001, "loss": 8.4305, "loss/crossentropy": 2.16032737493515, "loss/hidden": 3.4765625, "loss/jsd": 0.0, "loss/logits": 0.28428590297698975, "step": 1964 }, { "epoch": 0.122875, "grad_norm": 3.1875, "grad_norm_var": 0.22750244140625, "learning_rate": 0.0001, "loss": 8.3857, "loss/crossentropy": 2.5656957626342773, "loss/hidden": 3.3359375, "loss/jsd": 0.0, "loss/logits": 0.3001856654882431, "step": 1966 }, { "epoch": 0.123, "grad_norm": 3.46875, "grad_norm_var": 0.22258199055989583, "learning_rate": 0.0001, "loss": 8.5005, "loss/crossentropy": 2.350658416748047, "loss/hidden": 3.390625, "loss/jsd": 0.0, "loss/logits": 0.29954949021339417, "step": 1968 }, { "epoch": 0.123125, "grad_norm": 3.390625, "grad_norm_var": 0.217919921875, "learning_rate": 0.0001, "loss": 8.7335, "loss/crossentropy": 2.528357982635498, "loss/hidden": 3.3984375, "loss/jsd": 0.0, "loss/logits": 0.3156994730234146, "step": 1970 }, { "epoch": 0.12325, "grad_norm": 3.515625, "grad_norm_var": 0.20797119140625, "learning_rate": 0.0001, "loss": 8.5015, "loss/crossentropy": 2.491376519203186, "loss/hidden": 3.375, "loss/jsd": 0.0, "loss/logits": 0.29096619784832, "step": 1972 }, { "epoch": 0.123375, "grad_norm": 3.171875, "grad_norm_var": 0.21378580729166666, "learning_rate": 0.0001, "loss": 8.624, "loss/crossentropy": 2.5760600566864014, "loss/hidden": 3.3515625, "loss/jsd": 0.0, "loss/logits": 0.30827219784259796, "step": 1974 }, { "epoch": 0.1235, "grad_norm": 3.546875, "grad_norm_var": 0.21451822916666666, "learning_rate": 0.0001, "loss": 8.588, "loss/crossentropy": 2.3280161023139954, "loss/hidden": 3.4453125, "loss/jsd": 0.0, "loss/logits": 0.29429638385772705, "step": 1976 }, { "epoch": 0.123625, "grad_norm": 3.3125, "grad_norm_var": 0.044367472330729164, "learning_rate": 0.0001, "loss": 8.578, "loss/crossentropy": 2.5439298152923584, "loss/hidden": 3.4140625, "loss/jsd": 0.0, "loss/logits": 0.30466994643211365, "step": 1978 }, { "epoch": 0.12375, "grad_norm": 3.140625, "grad_norm_var": 0.021708170572916668, "learning_rate": 0.0001, "loss": 8.3857, "loss/crossentropy": 2.264935255050659, "loss/hidden": 3.2890625, "loss/jsd": 0.0, "loss/logits": 0.28621000051498413, "step": 1980 }, { "epoch": 0.123875, "grad_norm": 3.375, "grad_norm_var": 0.015653483072916665, "learning_rate": 0.0001, "loss": 8.5949, "loss/crossentropy": 2.4394866228103638, "loss/hidden": 3.3125, "loss/jsd": 0.0, "loss/logits": 0.28367021679878235, "step": 1982 }, { "epoch": 0.124, "grad_norm": 3.09375, "grad_norm_var": 0.03601786295572917, "learning_rate": 0.0001, "loss": 8.6264, "loss/crossentropy": 2.1645785570144653, "loss/hidden": 3.3984375, "loss/jsd": 0.0, "loss/logits": 0.32053497433662415, "step": 1984 }, { "epoch": 0.124125, "grad_norm": 3.125, "grad_norm_var": 0.04356180826822917, "learning_rate": 0.0001, "loss": 8.283, "loss/crossentropy": 2.332149863243103, "loss/hidden": 3.2578125, "loss/jsd": 0.0, "loss/logits": 0.2783034145832062, "step": 1986 }, { "epoch": 0.12425, "grad_norm": 3.125, "grad_norm_var": 0.0423492431640625, "learning_rate": 0.0001, "loss": 8.419, "loss/crossentropy": 2.2908122539520264, "loss/hidden": 3.359375, "loss/jsd": 0.0, "loss/logits": 0.26500867307186127, "step": 1988 }, { "epoch": 0.124375, "grad_norm": 3.046875, "grad_norm_var": 0.0461334228515625, "learning_rate": 0.0001, "loss": 8.4965, "loss/crossentropy": 2.2126917839050293, "loss/hidden": 3.296875, "loss/jsd": 0.0, "loss/logits": 0.2839796543121338, "step": 1990 }, { "epoch": 0.1245, "grad_norm": 4.09375, "grad_norm_var": 0.47056884765625, "learning_rate": 0.0001, "loss": 8.9506, "loss/crossentropy": 2.355017066001892, "loss/hidden": 3.359375, "loss/jsd": 0.0, "loss/logits": 0.28324250876903534, "step": 1992 }, { "epoch": 0.124625, "grad_norm": 3.15625, "grad_norm_var": 0.47298075358072916, "learning_rate": 0.0001, "loss": 8.5804, "loss/crossentropy": 2.408555507659912, "loss/hidden": 3.375, "loss/jsd": 0.0, "loss/logits": 0.28895722329616547, "step": 1994 }, { "epoch": 0.12475, "grad_norm": 3.046875, "grad_norm_var": 0.4876129150390625, "learning_rate": 0.0001, "loss": 8.2851, "loss/crossentropy": 2.251736283302307, "loss/hidden": 3.28125, "loss/jsd": 0.0, "loss/logits": 0.2749274671077728, "step": 1996 }, { "epoch": 0.124875, "grad_norm": 3.578125, "grad_norm_var": 0.4910634358723958, "learning_rate": 0.0001, "loss": 8.6195, "loss/crossentropy": 2.457157015800476, "loss/hidden": 3.328125, "loss/jsd": 0.0, "loss/logits": 0.31056058406829834, "step": 1998 }, { "epoch": 0.125, "grad_norm": 3.4375, "grad_norm_var": 0.47468973795572916, "learning_rate": 0.0001, "loss": 8.3637, "loss/crossentropy": 2.2277281284332275, "loss/hidden": 3.3046875, "loss/jsd": 0.0, "loss/logits": 0.2744043469429016, "step": 2000 }, { "epoch": 0.125125, "grad_norm": 3.28125, "grad_norm_var": 0.45701497395833335, "learning_rate": 0.0001, "loss": 8.6506, "loss/crossentropy": 2.4241243600845337, "loss/hidden": 3.34375, "loss/jsd": 0.0, "loss/logits": 0.2997867166996002, "step": 2002 }, { "epoch": 0.12525, "grad_norm": 3.296875, "grad_norm_var": 0.45701497395833335, "learning_rate": 0.0001, "loss": 8.5374, "loss/crossentropy": 2.3989826440811157, "loss/hidden": 3.3359375, "loss/jsd": 0.0, "loss/logits": 0.2899128496646881, "step": 2004 }, { "epoch": 0.125375, "grad_norm": 3.65625, "grad_norm_var": 0.44111226399739584, "learning_rate": 0.0001, "loss": 8.5511, "loss/crossentropy": 2.326428711414337, "loss/hidden": 3.3671875, "loss/jsd": 0.0, "loss/logits": 0.30725909769535065, "step": 2006 }, { "epoch": 0.1255, "grad_norm": 3.015625, "grad_norm_var": 0.0493560791015625, "learning_rate": 0.0001, "loss": 8.4754, "loss/crossentropy": 2.232123017311096, "loss/hidden": 3.2578125, "loss/jsd": 0.0, "loss/logits": 0.26897597312927246, "step": 2008 }, { "epoch": 0.125625, "grad_norm": 3.171875, "grad_norm_var": 0.046240234375, "learning_rate": 0.0001, "loss": 8.5567, "loss/crossentropy": 2.353936553001404, "loss/hidden": 3.328125, "loss/jsd": 0.0, "loss/logits": 0.27618004381656647, "step": 2010 }, { "epoch": 0.12575, "grad_norm": 3.203125, "grad_norm_var": 0.039469401041666664, "learning_rate": 0.0001, "loss": 8.5775, "loss/crossentropy": 1.9717338681221008, "loss/hidden": 3.2578125, "loss/jsd": 0.0, "loss/logits": 0.26382866501808167, "step": 2012 }, { "epoch": 0.125875, "grad_norm": 3.203125, "grad_norm_var": 0.023908487955729165, "learning_rate": 0.0001, "loss": 8.5801, "loss/crossentropy": 2.315757632255554, "loss/hidden": 3.328125, "loss/jsd": 0.0, "loss/logits": 0.29249346256256104, "step": 2014 }, { "epoch": 0.126, "grad_norm": 4.71875, "grad_norm_var": 0.7569986979166666, "learning_rate": 0.0001, "loss": 8.9827, "loss/crossentropy": 2.2849130630493164, "loss/hidden": 3.4921875, "loss/jsd": 0.0, "loss/logits": 0.36747997999191284, "step": 2016 }, { "epoch": 0.126125, "grad_norm": 3.3125, "grad_norm_var": 0.8076456705729167, "learning_rate": 0.0001, "loss": 8.4695, "loss/crossentropy": 2.2076889276504517, "loss/hidden": 3.4609375, "loss/jsd": 0.0, "loss/logits": 0.33494821190834045, "step": 2018 }, { "epoch": 0.12625, "grad_norm": 3.28125, "grad_norm_var": 0.7920888264973959, "learning_rate": 0.0001, "loss": 8.7315, "loss/crossentropy": 2.2882113456726074, "loss/hidden": 3.390625, "loss/jsd": 0.0, "loss/logits": 0.2940904349088669, "step": 2020 }, { "epoch": 0.126375, "grad_norm": 3.78125, "grad_norm_var": 0.7946451822916667, "learning_rate": 0.0001, "loss": 8.8536, "loss/crossentropy": 2.5705249309539795, "loss/hidden": 3.4609375, "loss/jsd": 0.0, "loss/logits": 0.33114902675151825, "step": 2022 }, { "epoch": 0.1265, "grad_norm": 3.125, "grad_norm_var": 0.78818359375, "learning_rate": 0.0001, "loss": 8.5144, "loss/crossentropy": 2.367907762527466, "loss/hidden": 3.3984375, "loss/jsd": 0.0, "loss/logits": 0.29049575328826904, "step": 2024 }, { "epoch": 0.126625, "grad_norm": 4.34375, "grad_norm_var": 2.528955078125, "learning_rate": 0.0001, "loss": 8.9281, "loss/crossentropy": 2.29294753074646, "loss/hidden": 3.6171875, "loss/jsd": 0.0, "loss/logits": 0.38897041976451874, "step": 2026 }, { "epoch": 0.12675, "grad_norm": 3.703125, "grad_norm_var": 2.452977498372396, "learning_rate": 0.0001, "loss": 8.7142, "loss/crossentropy": 2.286558747291565, "loss/hidden": 3.4375, "loss/jsd": 0.0, "loss/logits": 0.28155122697353363, "step": 2028 }, { "epoch": 0.126875, "grad_norm": 3.46875, "grad_norm_var": 2.3957753499348957, "learning_rate": 0.0001, "loss": 8.5948, "loss/crossentropy": 2.4738941192626953, "loss/hidden": 3.40625, "loss/jsd": 0.0, "loss/logits": 0.2834329903125763, "step": 2030 }, { "epoch": 0.127, "grad_norm": 3.296875, "grad_norm_var": 2.019173177083333, "learning_rate": 0.0001, "loss": 8.7443, "loss/crossentropy": 2.4368330240249634, "loss/hidden": 3.328125, "loss/jsd": 0.0, "loss/logits": 0.2754772901535034, "step": 2032 }, { "epoch": 0.127125, "grad_norm": 3.484375, "grad_norm_var": 1.99468994140625, "learning_rate": 0.0001, "loss": 8.4763, "loss/crossentropy": 2.287817358970642, "loss/hidden": 3.3046875, "loss/jsd": 0.0, "loss/logits": 0.25924333184957504, "step": 2034 }, { "epoch": 0.12725, "grad_norm": 3.28125, "grad_norm_var": 2.008112589518229, "learning_rate": 0.0001, "loss": 8.6018, "loss/crossentropy": 2.2494828701019287, "loss/hidden": 3.3671875, "loss/jsd": 0.0, "loss/logits": 0.29782405495643616, "step": 2036 }, { "epoch": 0.127375, "grad_norm": 3.5625, "grad_norm_var": 2.017438761393229, "learning_rate": 0.0001, "loss": 8.6547, "loss/crossentropy": 2.3243712186813354, "loss/hidden": 3.3828125, "loss/jsd": 0.0, "loss/logits": 0.28735925257205963, "step": 2038 }, { "epoch": 0.1275, "grad_norm": 3.875, "grad_norm_var": 1.9649648030598958, "learning_rate": 0.0001, "loss": 8.5979, "loss/crossentropy": 2.205388069152832, "loss/hidden": 3.2421875, "loss/jsd": 0.0, "loss/logits": 0.2879514992237091, "step": 2040 }, { "epoch": 0.127625, "grad_norm": 3.546875, "grad_norm_var": 0.0300201416015625, "learning_rate": 0.0001, "loss": 8.7234, "loss/crossentropy": 2.3310853242874146, "loss/hidden": 3.296875, "loss/jsd": 0.0, "loss/logits": 0.27090397477149963, "step": 2042 }, { "epoch": 0.12775, "grad_norm": 3.609375, "grad_norm_var": 0.0336822509765625, "learning_rate": 0.0001, "loss": 8.6942, "loss/crossentropy": 2.411409616470337, "loss/hidden": 3.421875, "loss/jsd": 0.0, "loss/logits": 0.3128150999546051, "step": 2044 }, { "epoch": 0.127875, "grad_norm": 3.71875, "grad_norm_var": 0.0386871337890625, "learning_rate": 0.0001, "loss": 8.7962, "loss/crossentropy": 2.531530022621155, "loss/hidden": 3.390625, "loss/jsd": 0.0, "loss/logits": 0.2983619421720505, "step": 2046 }, { "epoch": 0.128, "grad_norm": 3.28125, "grad_norm_var": 0.03849995930989583, "learning_rate": 0.0001, "loss": 8.6854, "loss/crossentropy": 2.4569283723831177, "loss/hidden": 3.484375, "loss/jsd": 0.0, "loss/logits": 0.2882058769464493, "step": 2048 }, { "epoch": 0.128125, "grad_norm": 3.15625, "grad_norm_var": 0.051634724934895834, "learning_rate": 0.0001, "loss": 8.5848, "loss/crossentropy": 2.2018297910690308, "loss/hidden": 3.3125, "loss/jsd": 0.0, "loss/logits": 0.27827227115631104, "step": 2050 }, { "epoch": 0.12825, "grad_norm": 3.65625, "grad_norm_var": 0.08240559895833334, "learning_rate": 0.0001, "loss": 9.0058, "loss/crossentropy": 2.3052055835723877, "loss/hidden": 3.390625, "loss/jsd": 0.0, "loss/logits": 0.3264636695384979, "step": 2052 }, { "epoch": 0.128375, "grad_norm": 3.265625, "grad_norm_var": 0.08439839680989583, "learning_rate": 0.0001, "loss": 8.3336, "loss/crossentropy": 2.3923838138580322, "loss/hidden": 3.3984375, "loss/jsd": 0.0, "loss/logits": 0.28575557470321655, "step": 2054 }, { "epoch": 0.1285, "grad_norm": 3.296875, "grad_norm_var": 0.0878326416015625, "learning_rate": 0.0001, "loss": 8.513, "loss/crossentropy": 2.1959877014160156, "loss/hidden": 3.3671875, "loss/jsd": 0.0, "loss/logits": 0.2940082252025604, "step": 2056 }, { "epoch": 0.128625, "grad_norm": 3.59375, "grad_norm_var": 0.08575846354166666, "learning_rate": 0.0001, "loss": 8.812, "loss/crossentropy": 2.4442625045776367, "loss/hidden": 3.5390625, "loss/jsd": 0.0, "loss/logits": 0.30026645958423615, "step": 2058 }, { "epoch": 0.12875, "grad_norm": 3.375, "grad_norm_var": 0.08087565104166666, "learning_rate": 0.0001, "loss": 8.5372, "loss/crossentropy": 2.319927215576172, "loss/hidden": 3.53125, "loss/jsd": 0.0, "loss/logits": 0.33365097641944885, "step": 2060 }, { "epoch": 0.128875, "grad_norm": 3.046875, "grad_norm_var": 0.079296875, "learning_rate": 0.0001, "loss": 8.6948, "loss/crossentropy": 2.263151526451111, "loss/hidden": 3.34375, "loss/jsd": 0.0, "loss/logits": 0.3065057694911957, "step": 2062 }, { "epoch": 0.129, "grad_norm": 3.109375, "grad_norm_var": 0.08463541666666667, "learning_rate": 0.0001, "loss": 8.4387, "loss/crossentropy": 2.470115303993225, "loss/hidden": 3.328125, "loss/jsd": 0.0, "loss/logits": 0.29933932423591614, "step": 2064 }, { "epoch": 0.129125, "grad_norm": 3.0625, "grad_norm_var": 0.08762613932291667, "learning_rate": 0.0001, "loss": 8.6413, "loss/crossentropy": 2.356273889541626, "loss/hidden": 3.453125, "loss/jsd": 0.0, "loss/logits": 0.2907129377126694, "step": 2066 }, { "epoch": 0.12925, "grad_norm": 3.625, "grad_norm_var": 0.034505208333333336, "learning_rate": 0.0001, "loss": 8.7037, "loss/crossentropy": 2.3621848821640015, "loss/hidden": 3.3125, "loss/jsd": 0.0, "loss/logits": 0.2840597927570343, "step": 2068 }, { "epoch": 0.129375, "grad_norm": 3.5, "grad_norm_var": 0.03486226399739583, "learning_rate": 0.0001, "loss": 8.5151, "loss/crossentropy": 2.034373462200165, "loss/hidden": 3.34375, "loss/jsd": 0.0, "loss/logits": 0.2726925313472748, "step": 2070 }, { "epoch": 0.1295, "grad_norm": 3.515625, "grad_norm_var": 0.034016927083333336, "learning_rate": 0.0001, "loss": 8.6894, "loss/crossentropy": 2.224942922592163, "loss/hidden": 3.296875, "loss/jsd": 0.0, "loss/logits": 0.2749377191066742, "step": 2072 }, { "epoch": 0.129625, "grad_norm": 3.0, "grad_norm_var": 0.04013671875, "learning_rate": 0.0001, "loss": 8.7369, "loss/crossentropy": 2.385498046875, "loss/hidden": 3.3046875, "loss/jsd": 0.0, "loss/logits": 0.28188909590244293, "step": 2074 }, { "epoch": 0.12975, "grad_norm": 3.390625, "grad_norm_var": 0.04454752604166667, "learning_rate": 0.0001, "loss": 8.79, "loss/crossentropy": 2.5538192987442017, "loss/hidden": 3.5078125, "loss/jsd": 0.0, "loss/logits": 0.301289901137352, "step": 2076 }, { "epoch": 0.129875, "grad_norm": 3.203125, "grad_norm_var": 0.04350484212239583, "learning_rate": 0.0001, "loss": 8.5047, "loss/crossentropy": 2.343226909637451, "loss/hidden": 3.3046875, "loss/jsd": 0.0, "loss/logits": 0.30614304542541504, "step": 2078 }, { "epoch": 0.13, "grad_norm": 3.5, "grad_norm_var": 0.04088541666666667, "learning_rate": 0.0001, "loss": 8.5874, "loss/crossentropy": 2.5897929668426514, "loss/hidden": 3.46875, "loss/jsd": 0.0, "loss/logits": 0.29140761494636536, "step": 2080 }, { "epoch": 0.130125, "grad_norm": 3.1875, "grad_norm_var": 0.0357330322265625, "learning_rate": 0.0001, "loss": 8.4543, "loss/crossentropy": 2.3254483938217163, "loss/hidden": 3.296875, "loss/jsd": 0.0, "loss/logits": 0.27507585287094116, "step": 2082 }, { "epoch": 0.13025, "grad_norm": 3.359375, "grad_norm_var": 0.03479817708333333, "learning_rate": 0.0001, "loss": 8.5021, "loss/crossentropy": 2.230491876602173, "loss/hidden": 3.359375, "loss/jsd": 0.0, "loss/logits": 0.2707635313272476, "step": 2084 }, { "epoch": 0.130375, "grad_norm": 3.328125, "grad_norm_var": 0.0332183837890625, "learning_rate": 0.0001, "loss": 8.355, "loss/crossentropy": 2.4114824533462524, "loss/hidden": 3.296875, "loss/jsd": 0.0, "loss/logits": 0.28432922065258026, "step": 2086 }, { "epoch": 0.1305, "grad_norm": 3.15625, "grad_norm_var": 0.0313873291015625, "learning_rate": 0.0001, "loss": 8.5269, "loss/crossentropy": 2.230819344520569, "loss/hidden": 3.34375, "loss/jsd": 0.0, "loss/logits": 0.26765523850917816, "step": 2088 }, { "epoch": 0.130625, "grad_norm": 3.203125, "grad_norm_var": 0.0262115478515625, "learning_rate": 0.0001, "loss": 8.4321, "loss/crossentropy": 2.4839935302734375, "loss/hidden": 3.359375, "loss/jsd": 0.0, "loss/logits": 0.2982227951288223, "step": 2090 }, { "epoch": 0.13075, "grad_norm": 3.5, "grad_norm_var": 0.020796712239583334, "learning_rate": 0.0001, "loss": 8.5413, "loss/crossentropy": 2.4406583309173584, "loss/hidden": 3.4140625, "loss/jsd": 0.0, "loss/logits": 0.27541063725948334, "step": 2092 }, { "epoch": 0.130875, "grad_norm": 3.625, "grad_norm_var": 0.0326080322265625, "learning_rate": 0.0001, "loss": 8.3994, "loss/crossentropy": 2.193255662918091, "loss/hidden": 3.265625, "loss/jsd": 0.0, "loss/logits": 0.2544540911912918, "step": 2094 }, { "epoch": 0.131, "grad_norm": 3.375, "grad_norm_var": 0.026927693684895834, "learning_rate": 0.0001, "loss": 8.6152, "loss/crossentropy": 2.3712345361709595, "loss/hidden": 3.3046875, "loss/jsd": 0.0, "loss/logits": 0.27534911036491394, "step": 2096 }, { "epoch": 0.131125, "grad_norm": 3.21875, "grad_norm_var": 0.036454264322916666, "learning_rate": 0.0001, "loss": 8.4218, "loss/crossentropy": 2.301249861717224, "loss/hidden": 3.28125, "loss/jsd": 0.0, "loss/logits": 0.2652607709169388, "step": 2098 }, { "epoch": 0.13125, "grad_norm": 3.40625, "grad_norm_var": 0.059056599934895836, "learning_rate": 0.0001, "loss": 8.8422, "loss/crossentropy": 2.4641329050064087, "loss/hidden": 3.34375, "loss/jsd": 0.0, "loss/logits": 0.2975671887397766, "step": 2100 }, { "epoch": 0.131375, "grad_norm": 3.328125, "grad_norm_var": 0.05963134765625, "learning_rate": 0.0001, "loss": 8.5845, "loss/crossentropy": 2.577459692955017, "loss/hidden": 3.40625, "loss/jsd": 0.0, "loss/logits": 0.2966485619544983, "step": 2102 }, { "epoch": 0.1315, "grad_norm": 3.0, "grad_norm_var": 0.0670562744140625, "learning_rate": 0.0001, "loss": 8.4679, "loss/crossentropy": 2.3673853874206543, "loss/hidden": 3.359375, "loss/jsd": 0.0, "loss/logits": 0.2907790243625641, "step": 2104 }, { "epoch": 0.131625, "grad_norm": 3.421875, "grad_norm_var": 0.06606343587239584, "learning_rate": 0.0001, "loss": 8.7638, "loss/crossentropy": 2.519857883453369, "loss/hidden": 3.5, "loss/jsd": 0.0, "loss/logits": 0.30287206172943115, "step": 2106 }, { "epoch": 0.13175, "grad_norm": 3.28125, "grad_norm_var": 0.06646728515625, "learning_rate": 0.0001, "loss": 8.4918, "loss/crossentropy": 2.1893075108528137, "loss/hidden": 3.3203125, "loss/jsd": 0.0, "loss/logits": 0.2853415459394455, "step": 2108 }, { "epoch": 0.131875, "grad_norm": 3.421875, "grad_norm_var": 0.054442342122395834, "learning_rate": 0.0001, "loss": 8.3529, "loss/crossentropy": 2.02129727602005, "loss/hidden": 3.296875, "loss/jsd": 0.0, "loss/logits": 0.2583470046520233, "step": 2110 }, { "epoch": 0.132, "grad_norm": 3.0625, "grad_norm_var": 0.06260477701822917, "learning_rate": 0.0001, "loss": 8.2412, "loss/crossentropy": 2.3207201957702637, "loss/hidden": 3.2734375, "loss/jsd": 0.0, "loss/logits": 0.2662471830844879, "step": 2112 }, { "epoch": 0.132125, "grad_norm": 3.09375, "grad_norm_var": 0.05496419270833333, "learning_rate": 0.0001, "loss": 8.6733, "loss/crossentropy": 2.3841429948806763, "loss/hidden": 3.3515625, "loss/jsd": 0.0, "loss/logits": 0.2923773378133774, "step": 2114 }, { "epoch": 0.13225, "grad_norm": 3.359375, "grad_norm_var": 0.026488240559895834, "learning_rate": 0.0001, "loss": 8.6388, "loss/crossentropy": 2.178789973258972, "loss/hidden": 3.3125, "loss/jsd": 0.0, "loss/logits": 0.27935703098773956, "step": 2116 }, { "epoch": 0.132375, "grad_norm": 3.390625, "grad_norm_var": 0.04003499348958333, "learning_rate": 0.0001, "loss": 8.5069, "loss/crossentropy": 2.1232659816741943, "loss/hidden": 3.2734375, "loss/jsd": 0.0, "loss/logits": 0.2767260819673538, "step": 2118 }, { "epoch": 0.1325, "grad_norm": 3.859375, "grad_norm_var": 0.054906209309895836, "learning_rate": 0.0001, "loss": 8.6523, "loss/crossentropy": 2.349829316139221, "loss/hidden": 3.296875, "loss/jsd": 0.0, "loss/logits": 0.2905244827270508, "step": 2120 }, { "epoch": 0.132625, "grad_norm": 3.453125, "grad_norm_var": 0.08715718587239583, "learning_rate": 0.0001, "loss": 8.7333, "loss/crossentropy": 2.5470376014709473, "loss/hidden": 3.34375, "loss/jsd": 0.0, "loss/logits": 0.3190218657255173, "step": 2122 }, { "epoch": 0.13275, "grad_norm": 3.28125, "grad_norm_var": 0.08217671712239584, "learning_rate": 0.0001, "loss": 8.6398, "loss/crossentropy": 2.436127185821533, "loss/hidden": 3.2890625, "loss/jsd": 0.0, "loss/logits": 0.29636839032173157, "step": 2124 }, { "epoch": 0.132875, "grad_norm": 3.28125, "grad_norm_var": 0.07965494791666666, "learning_rate": 0.0001, "loss": 8.2415, "loss/crossentropy": 2.409112572669983, "loss/hidden": 3.3359375, "loss/jsd": 0.0, "loss/logits": 0.2878931760787964, "step": 2126 }, { "epoch": 0.133, "grad_norm": 3.09375, "grad_norm_var": 0.06551106770833333, "learning_rate": 0.0001, "loss": 8.579, "loss/crossentropy": 2.3906946182250977, "loss/hidden": 3.3125, "loss/jsd": 0.0, "loss/logits": 0.27670612931251526, "step": 2128 }, { "epoch": 0.133125, "grad_norm": 3.15625, "grad_norm_var": 0.064599609375, "learning_rate": 0.0001, "loss": 8.4984, "loss/crossentropy": 2.351179838180542, "loss/hidden": 3.328125, "loss/jsd": 0.0, "loss/logits": 0.29204052686691284, "step": 2130 }, { "epoch": 0.13325, "grad_norm": 3.5625, "grad_norm_var": 0.06601460774739583, "learning_rate": 0.0001, "loss": 8.4208, "loss/crossentropy": 2.373140573501587, "loss/hidden": 3.265625, "loss/jsd": 0.0, "loss/logits": 0.27188287675380707, "step": 2132 }, { "epoch": 0.133375, "grad_norm": 3.40625, "grad_norm_var": 0.062353515625, "learning_rate": 0.0001, "loss": 8.5288, "loss/crossentropy": 2.3178237676620483, "loss/hidden": 3.2890625, "loss/jsd": 0.0, "loss/logits": 0.2606750875711441, "step": 2134 }, { "epoch": 0.1335, "grad_norm": 3.640625, "grad_norm_var": 0.051493326822916664, "learning_rate": 0.0001, "loss": 8.6286, "loss/crossentropy": 2.5622605085372925, "loss/hidden": 3.3671875, "loss/jsd": 0.0, "loss/logits": 0.2858807444572449, "step": 2136 }, { "epoch": 0.133625, "grad_norm": 3.21875, "grad_norm_var": 0.025934855143229168, "learning_rate": 0.0001, "loss": 8.6969, "loss/crossentropy": 2.461153268814087, "loss/hidden": 3.2890625, "loss/jsd": 0.0, "loss/logits": 0.2880861461162567, "step": 2138 }, { "epoch": 0.13375, "grad_norm": 3.3125, "grad_norm_var": 0.030370076497395832, "learning_rate": 0.0001, "loss": 8.2534, "loss/crossentropy": 2.104259490966797, "loss/hidden": 3.390625, "loss/jsd": 0.0, "loss/logits": 0.2679433301091194, "step": 2140 }, { "epoch": 0.133875, "grad_norm": 3.21875, "grad_norm_var": 0.0362457275390625, "learning_rate": 0.0001, "loss": 8.6085, "loss/crossentropy": 2.4813841581344604, "loss/hidden": 3.3125, "loss/jsd": 0.0, "loss/logits": 0.29470039904117584, "step": 2142 }, { "epoch": 0.134, "grad_norm": 3.09375, "grad_norm_var": 0.03515625, "learning_rate": 0.0001, "loss": 8.4156, "loss/crossentropy": 2.3401472568511963, "loss/hidden": 3.359375, "loss/jsd": 0.0, "loss/logits": 0.2929167151451111, "step": 2144 }, { "epoch": 0.134125, "grad_norm": 3.09375, "grad_norm_var": 0.037840779622395834, "learning_rate": 0.0001, "loss": 8.5012, "loss/crossentropy": 2.3249677419662476, "loss/hidden": 3.3515625, "loss/jsd": 0.0, "loss/logits": 0.2590218484401703, "step": 2146 }, { "epoch": 0.13425, "grad_norm": 3.40625, "grad_norm_var": 0.03340555826822917, "learning_rate": 0.0001, "loss": 8.2595, "loss/crossentropy": 2.1680409908294678, "loss/hidden": 3.2890625, "loss/jsd": 0.0, "loss/logits": 0.258025586605072, "step": 2148 }, { "epoch": 0.134375, "grad_norm": 3.578125, "grad_norm_var": 0.0428131103515625, "learning_rate": 0.0001, "loss": 8.4906, "loss/crossentropy": 2.3455265760421753, "loss/hidden": 3.28125, "loss/jsd": 0.0, "loss/logits": 0.2917974293231964, "step": 2150 }, { "epoch": 0.1345, "grad_norm": 3.359375, "grad_norm_var": 0.0469146728515625, "learning_rate": 0.0001, "loss": 8.7886, "loss/crossentropy": 2.513558268547058, "loss/hidden": 3.4453125, "loss/jsd": 0.0, "loss/logits": 0.3155888319015503, "step": 2152 }, { "epoch": 0.134625, "grad_norm": 3.078125, "grad_norm_var": 0.04812825520833333, "learning_rate": 0.0001, "loss": 8.5435, "loss/crossentropy": 2.6064085960388184, "loss/hidden": 3.2890625, "loss/jsd": 0.0, "loss/logits": 0.2825528085231781, "step": 2154 }, { "epoch": 0.13475, "grad_norm": 3.234375, "grad_norm_var": 0.06126200358072917, "learning_rate": 0.0001, "loss": 8.3432, "loss/crossentropy": 2.2562918663024902, "loss/hidden": 3.25, "loss/jsd": 0.0, "loss/logits": 0.28957146406173706, "step": 2156 }, { "epoch": 0.134875, "grad_norm": 3.265625, "grad_norm_var": 0.05621337890625, "learning_rate": 0.0001, "loss": 8.5618, "loss/crossentropy": 2.3136632442474365, "loss/hidden": 3.2734375, "loss/jsd": 0.0, "loss/logits": 0.2644960880279541, "step": 2158 }, { "epoch": 0.135, "grad_norm": 3.125, "grad_norm_var": 0.055394490559895836, "learning_rate": 0.0001, "loss": 8.2383, "loss/crossentropy": 2.1522982120513916, "loss/hidden": 3.2734375, "loss/jsd": 0.0, "loss/logits": 0.2998431622982025, "step": 2160 }, { "epoch": 0.135125, "grad_norm": 3.28125, "grad_norm_var": 0.05455322265625, "learning_rate": 0.0001, "loss": 8.5748, "loss/crossentropy": 2.4402201175689697, "loss/hidden": 3.2578125, "loss/jsd": 0.0, "loss/logits": 0.2670477479696274, "step": 2162 }, { "epoch": 0.13525, "grad_norm": 3.125, "grad_norm_var": 0.05601806640625, "learning_rate": 0.0001, "loss": 8.3886, "loss/crossentropy": 2.46258282661438, "loss/hidden": 3.296875, "loss/jsd": 0.0, "loss/logits": 0.293775275349617, "step": 2164 }, { "epoch": 0.135375, "grad_norm": 2.875, "grad_norm_var": 0.0496734619140625, "learning_rate": 0.0001, "loss": 8.3114, "loss/crossentropy": 2.2047289609909058, "loss/hidden": 3.265625, "loss/jsd": 0.0, "loss/logits": 0.2652291804552078, "step": 2166 }, { "epoch": 0.1355, "grad_norm": 3.140625, "grad_norm_var": 0.028218587239583332, "learning_rate": 0.0001, "loss": 8.4665, "loss/crossentropy": 2.1923593282699585, "loss/hidden": 3.3046875, "loss/jsd": 0.0, "loss/logits": 0.2693886160850525, "step": 2168 }, { "epoch": 0.135625, "grad_norm": 3.359375, "grad_norm_var": 0.026949055989583335, "learning_rate": 0.0001, "loss": 8.4295, "loss/crossentropy": 2.529042959213257, "loss/hidden": 3.3125, "loss/jsd": 0.0, "loss/logits": 0.2995428442955017, "step": 2170 }, { "epoch": 0.13575, "grad_norm": 3.234375, "grad_norm_var": 0.017671712239583335, "learning_rate": 0.0001, "loss": 8.4678, "loss/crossentropy": 2.3110212087631226, "loss/hidden": 3.3828125, "loss/jsd": 0.0, "loss/logits": 0.2710278108716011, "step": 2172 }, { "epoch": 0.135875, "grad_norm": 3.53125, "grad_norm_var": 0.023298136393229165, "learning_rate": 0.0001, "loss": 8.3845, "loss/crossentropy": 2.3812527656555176, "loss/hidden": 3.3359375, "loss/jsd": 0.0, "loss/logits": 0.28374361991882324, "step": 2174 }, { "epoch": 0.136, "grad_norm": 2.9375, "grad_norm_var": 0.030338541666666666, "learning_rate": 0.0001, "loss": 8.4029, "loss/crossentropy": 2.444836735725403, "loss/hidden": 3.3515625, "loss/jsd": 0.0, "loss/logits": 0.2983601689338684, "step": 2176 }, { "epoch": 0.136125, "grad_norm": 3.1875, "grad_norm_var": 0.028856404622395835, "learning_rate": 0.0001, "loss": 8.4012, "loss/crossentropy": 2.2179330587387085, "loss/hidden": 3.265625, "loss/jsd": 0.0, "loss/logits": 0.27099834382534027, "step": 2178 }, { "epoch": 0.13625, "grad_norm": 3.203125, "grad_norm_var": 0.0270660400390625, "learning_rate": 0.0001, "loss": 8.1745, "loss/crossentropy": 2.220338225364685, "loss/hidden": 3.3046875, "loss/jsd": 0.0, "loss/logits": 0.26341523230075836, "step": 2180 }, { "epoch": 0.136375, "grad_norm": 3.375, "grad_norm_var": 0.0201324462890625, "learning_rate": 0.0001, "loss": 8.5619, "loss/crossentropy": 2.3334707021713257, "loss/hidden": 3.359375, "loss/jsd": 0.0, "loss/logits": 0.2882850170135498, "step": 2182 }, { "epoch": 0.1365, "grad_norm": 3.078125, "grad_norm_var": 0.022639973958333334, "learning_rate": 0.0001, "loss": 8.1465, "loss/crossentropy": 1.9602341055870056, "loss/hidden": 3.1875, "loss/jsd": 0.0, "loss/logits": 0.2532319873571396, "step": 2184 }, { "epoch": 0.136625, "grad_norm": 3.25, "grad_norm_var": 0.0210113525390625, "learning_rate": 0.0001, "loss": 8.4768, "loss/crossentropy": 2.127853035926819, "loss/hidden": 3.359375, "loss/jsd": 0.0, "loss/logits": 0.3045553117990494, "step": 2186 }, { "epoch": 0.13675, "grad_norm": 3.4375, "grad_norm_var": 0.0257720947265625, "learning_rate": 0.0001, "loss": 8.5223, "loss/crossentropy": 2.3501694202423096, "loss/hidden": 3.296875, "loss/jsd": 0.0, "loss/logits": 0.274165078997612, "step": 2188 }, { "epoch": 0.136875, "grad_norm": 3.40625, "grad_norm_var": 0.022223917643229167, "learning_rate": 0.0001, "loss": 8.4563, "loss/crossentropy": 2.1404179334640503, "loss/hidden": 3.375, "loss/jsd": 0.0, "loss/logits": 0.2844165414571762, "step": 2190 }, { "epoch": 0.137, "grad_norm": 3.71875, "grad_norm_var": 0.03388264973958333, "learning_rate": 0.0001, "loss": 8.4094, "loss/crossentropy": 2.245135545730591, "loss/hidden": 3.265625, "loss/jsd": 0.0, "loss/logits": 0.2556656002998352, "step": 2192 }, { "epoch": 0.137125, "grad_norm": 3.078125, "grad_norm_var": 0.03547261555989583, "learning_rate": 0.0001, "loss": 8.7157, "loss/crossentropy": 2.41064453125, "loss/hidden": 3.2734375, "loss/jsd": 0.0, "loss/logits": 0.267337366938591, "step": 2194 }, { "epoch": 0.13725, "grad_norm": 3.234375, "grad_norm_var": 0.03752848307291667, "learning_rate": 0.0001, "loss": 8.5773, "loss/crossentropy": 2.5315933227539062, "loss/hidden": 3.296875, "loss/jsd": 0.0, "loss/logits": 0.286949098110199, "step": 2196 }, { "epoch": 0.137375, "grad_norm": 2.84375, "grad_norm_var": 0.048111979166666666, "learning_rate": 0.0001, "loss": 8.3222, "loss/crossentropy": 2.3176004886627197, "loss/hidden": 3.2734375, "loss/jsd": 0.0, "loss/logits": 0.2734638601541519, "step": 2198 }, { "epoch": 0.1375, "grad_norm": 3.515625, "grad_norm_var": 0.05325113932291667, "learning_rate": 0.0001, "loss": 8.5529, "loss/crossentropy": 2.339990735054016, "loss/hidden": 3.234375, "loss/jsd": 0.0, "loss/logits": 0.27560897171497345, "step": 2200 }, { "epoch": 0.137625, "grad_norm": 3.078125, "grad_norm_var": 0.06060791015625, "learning_rate": 0.0001, "loss": 8.4805, "loss/crossentropy": 2.284912347793579, "loss/hidden": 3.2578125, "loss/jsd": 0.0, "loss/logits": 0.2868229001760483, "step": 2202 }, { "epoch": 0.13775, "grad_norm": 3.09375, "grad_norm_var": 0.056452433268229164, "learning_rate": 0.0001, "loss": 8.4745, "loss/crossentropy": 2.400985598564148, "loss/hidden": 3.546875, "loss/jsd": 0.0, "loss/logits": 0.3073076903820038, "step": 2204 }, { "epoch": 0.137875, "grad_norm": 3.21875, "grad_norm_var": 0.05416666666666667, "learning_rate": 0.0001, "loss": 8.3061, "loss/crossentropy": 2.6853994131088257, "loss/hidden": 3.34375, "loss/jsd": 0.0, "loss/logits": 0.29532191157341003, "step": 2206 }, { "epoch": 0.138, "grad_norm": 3.203125, "grad_norm_var": 0.025739542643229165, "learning_rate": 0.0001, "loss": 8.3561, "loss/crossentropy": 2.3069804906845093, "loss/hidden": 3.2890625, "loss/jsd": 0.0, "loss/logits": 0.27918627858161926, "step": 2208 }, { "epoch": 0.138125, "grad_norm": 3.15625, "grad_norm_var": 0.04568684895833333, "learning_rate": 0.0001, "loss": 8.303, "loss/crossentropy": 2.101306200027466, "loss/hidden": 3.2578125, "loss/jsd": 0.0, "loss/logits": 0.27222634851932526, "step": 2210 }, { "epoch": 0.13825, "grad_norm": 3.328125, "grad_norm_var": 0.0466461181640625, "learning_rate": 0.0001, "loss": 8.4047, "loss/crossentropy": 2.2453717589378357, "loss/hidden": 3.3984375, "loss/jsd": 0.0, "loss/logits": 0.2868994474411011, "step": 2212 }, { "epoch": 0.138375, "grad_norm": 3.1875, "grad_norm_var": 0.03795166015625, "learning_rate": 0.0001, "loss": 8.371, "loss/crossentropy": 2.373469829559326, "loss/hidden": 3.3046875, "loss/jsd": 0.0, "loss/logits": 0.26734408736228943, "step": 2214 }, { "epoch": 0.1385, "grad_norm": 3.3125, "grad_norm_var": 0.0298492431640625, "learning_rate": 0.0001, "loss": 8.5017, "loss/crossentropy": 2.28477144241333, "loss/hidden": 3.3203125, "loss/jsd": 0.0, "loss/logits": 0.2690521627664566, "step": 2216 }, { "epoch": 0.138625, "grad_norm": 3.34375, "grad_norm_var": 0.025419108072916665, "learning_rate": 0.0001, "loss": 8.4961, "loss/crossentropy": 2.404952049255371, "loss/hidden": 3.3203125, "loss/jsd": 0.0, "loss/logits": 0.28461331129074097, "step": 2218 }, { "epoch": 0.13875, "grad_norm": 3.15625, "grad_norm_var": 0.0295806884765625, "learning_rate": 0.0001, "loss": 8.5059, "loss/crossentropy": 2.3059340715408325, "loss/hidden": 3.3125, "loss/jsd": 0.0, "loss/logits": 0.27501142024993896, "step": 2220 }, { "epoch": 0.138875, "grad_norm": 3.03125, "grad_norm_var": 0.032877604166666664, "learning_rate": 0.0001, "loss": 8.5706, "loss/crossentropy": 2.2778221368789673, "loss/hidden": 3.3125, "loss/jsd": 0.0, "loss/logits": 0.2830122113227844, "step": 2222 }, { "epoch": 0.139, "grad_norm": 3.421875, "grad_norm_var": 0.0301666259765625, "learning_rate": 0.0001, "loss": 8.4701, "loss/crossentropy": 2.1778889894485474, "loss/hidden": 3.265625, "loss/jsd": 0.0, "loss/logits": 0.2717326432466507, "step": 2224 }, { "epoch": 0.139125, "grad_norm": 3.25, "grad_norm_var": 0.02222900390625, "learning_rate": 0.0001, "loss": 8.5608, "loss/crossentropy": 2.4517624378204346, "loss/hidden": 3.296875, "loss/jsd": 0.0, "loss/logits": 0.2836722731590271, "step": 2226 }, { "epoch": 0.13925, "grad_norm": 3.453125, "grad_norm_var": 0.02724609375, "learning_rate": 0.0001, "loss": 8.3748, "loss/crossentropy": 2.1878401041030884, "loss/hidden": 3.296875, "loss/jsd": 0.0, "loss/logits": 0.2892114222049713, "step": 2228 }, { "epoch": 0.139375, "grad_norm": 3.09375, "grad_norm_var": 0.02783203125, "learning_rate": 0.0001, "loss": 8.4792, "loss/crossentropy": 2.355382800102234, "loss/hidden": 3.296875, "loss/jsd": 0.0, "loss/logits": 0.25766345858573914, "step": 2230 }, { "epoch": 0.1395, "grad_norm": 3.59375, "grad_norm_var": 0.035130818684895836, "learning_rate": 0.0001, "loss": 8.5125, "loss/crossentropy": 2.2248635292053223, "loss/hidden": 3.2890625, "loss/jsd": 0.0, "loss/logits": 0.2683367282152176, "step": 2232 }, { "epoch": 0.139625, "grad_norm": 3.625, "grad_norm_var": 0.0451080322265625, "learning_rate": 0.0001, "loss": 8.5788, "loss/crossentropy": 2.253064751625061, "loss/hidden": 3.2578125, "loss/jsd": 0.0, "loss/logits": 0.2640530467033386, "step": 2234 }, { "epoch": 0.13975, "grad_norm": 3.140625, "grad_norm_var": 0.043797810872395836, "learning_rate": 0.0001, "loss": 8.3916, "loss/crossentropy": 2.373534321784973, "loss/hidden": 3.2734375, "loss/jsd": 0.0, "loss/logits": 0.27554066479206085, "step": 2236 }, { "epoch": 0.139875, "grad_norm": 3.140625, "grad_norm_var": 0.04543355305989583, "learning_rate": 0.0001, "loss": 8.5169, "loss/crossentropy": 2.360267996788025, "loss/hidden": 3.34375, "loss/jsd": 0.0, "loss/logits": 0.2985559552907944, "step": 2238 }, { "epoch": 0.14, "grad_norm": 3.0, "grad_norm_var": 0.05829671223958333, "learning_rate": 0.0001, "loss": 8.2996, "loss/crossentropy": 2.340610146522522, "loss/hidden": 3.2421875, "loss/jsd": 0.0, "loss/logits": 0.26291558146476746, "step": 2240 }, { "epoch": 0.140125, "grad_norm": 3.296875, "grad_norm_var": 0.052144368489583336, "learning_rate": 0.0001, "loss": 8.592, "loss/crossentropy": 2.3476451635360718, "loss/hidden": 3.3046875, "loss/jsd": 0.0, "loss/logits": 0.30578161776065826, "step": 2242 }, { "epoch": 0.14025, "grad_norm": 3.265625, "grad_norm_var": 0.05040690104166667, "learning_rate": 0.0001, "loss": 8.2486, "loss/crossentropy": 2.257493257522583, "loss/hidden": 3.2265625, "loss/jsd": 0.0, "loss/logits": 0.2669792026281357, "step": 2244 }, { "epoch": 0.140375, "grad_norm": 3.03125, "grad_norm_var": 0.05563151041666667, "learning_rate": 0.0001, "loss": 8.2584, "loss/crossentropy": 2.1079065799713135, "loss/hidden": 3.234375, "loss/jsd": 0.0, "loss/logits": 0.2586442455649376, "step": 2246 }, { "epoch": 0.1405, "grad_norm": 3.15625, "grad_norm_var": 0.04586181640625, "learning_rate": 0.0001, "loss": 8.3155, "loss/crossentropy": 2.3655279874801636, "loss/hidden": 3.3125, "loss/jsd": 0.0, "loss/logits": 0.27686166763305664, "step": 2248 }, { "epoch": 0.140625, "grad_norm": 3.328125, "grad_norm_var": 0.023860677083333334, "learning_rate": 0.0001, "loss": 8.5854, "loss/crossentropy": 2.2470057010650635, "loss/hidden": 3.2734375, "loss/jsd": 0.0, "loss/logits": 0.2807578891515732, "step": 2250 }, { "epoch": 0.14075, "grad_norm": 2.984375, "grad_norm_var": 0.020894368489583332, "learning_rate": 0.0001, "loss": 8.328, "loss/crossentropy": 2.281611919403076, "loss/hidden": 3.25, "loss/jsd": 0.0, "loss/logits": 0.26320287585258484, "step": 2252 }, { "epoch": 0.140875, "grad_norm": 3.71875, "grad_norm_var": 0.042708333333333334, "learning_rate": 0.0001, "loss": 8.5691, "loss/crossentropy": 2.281616449356079, "loss/hidden": 3.3125, "loss/jsd": 0.0, "loss/logits": 0.2866251766681671, "step": 2254 }, { "epoch": 0.141, "grad_norm": 3.25, "grad_norm_var": 0.039872233072916666, "learning_rate": 0.0001, "loss": 8.6355, "loss/crossentropy": 2.4851930141448975, "loss/hidden": 3.3125, "loss/jsd": 0.0, "loss/logits": 0.2863723188638687, "step": 2256 }, { "epoch": 0.141125, "grad_norm": 3.0625, "grad_norm_var": 0.04378153483072917, "learning_rate": 0.0001, "loss": 8.3547, "loss/crossentropy": 2.2120649814605713, "loss/hidden": 3.3125, "loss/jsd": 0.0, "loss/logits": 0.26930585503578186, "step": 2258 }, { "epoch": 0.14125, "grad_norm": 3.234375, "grad_norm_var": 0.042008463541666666, "learning_rate": 0.0001, "loss": 8.5007, "loss/crossentropy": 2.3971279859542847, "loss/hidden": 3.3125, "loss/jsd": 0.0, "loss/logits": 0.29983802139759064, "step": 2260 }, { "epoch": 0.141375, "grad_norm": 3.328125, "grad_norm_var": 0.03905843098958333, "learning_rate": 0.0001, "loss": 8.5227, "loss/crossentropy": 2.2901759147644043, "loss/hidden": 3.2734375, "loss/jsd": 0.0, "loss/logits": 0.2883574813604355, "step": 2262 }, { "epoch": 0.1415, "grad_norm": 3.0625, "grad_norm_var": 0.041478474934895836, "learning_rate": 0.0001, "loss": 8.402, "loss/crossentropy": 2.1983225345611572, "loss/hidden": 3.375, "loss/jsd": 0.0, "loss/logits": 0.2799525707960129, "step": 2264 }, { "epoch": 0.141625, "grad_norm": 3.296875, "grad_norm_var": 0.038996378580729164, "learning_rate": 0.0001, "loss": 8.7039, "loss/crossentropy": 2.2069029808044434, "loss/hidden": 3.28125, "loss/jsd": 0.0, "loss/logits": 0.2689012736082077, "step": 2266 }, { "epoch": 0.14175, "grad_norm": 3.546875, "grad_norm_var": 0.03759358723958333, "learning_rate": 0.0001, "loss": 8.5396, "loss/crossentropy": 2.430467963218689, "loss/hidden": 3.3359375, "loss/jsd": 0.0, "loss/logits": 0.2826516032218933, "step": 2268 }, { "epoch": 0.141875, "grad_norm": 3.0625, "grad_norm_var": 0.022489420572916665, "learning_rate": 0.0001, "loss": 8.2918, "loss/crossentropy": 2.29804265499115, "loss/hidden": 3.2578125, "loss/jsd": 0.0, "loss/logits": 0.2685416340827942, "step": 2270 }, { "epoch": 0.142, "grad_norm": 3.1875, "grad_norm_var": 0.020930989583333334, "learning_rate": 0.0001, "loss": 8.4861, "loss/crossentropy": 2.4030654430389404, "loss/hidden": 3.2578125, "loss/jsd": 0.0, "loss/logits": 0.26462114602327347, "step": 2272 }, { "epoch": 0.142125, "grad_norm": 3.453125, "grad_norm_var": 0.12026265462239584, "learning_rate": 0.0001, "loss": 8.3658, "loss/crossentropy": 2.0990917682647705, "loss/hidden": 3.453125, "loss/jsd": 0.0, "loss/logits": 0.29184460639953613, "step": 2274 }, { "epoch": 0.14225, "grad_norm": 3.1875, "grad_norm_var": 0.12245992024739584, "learning_rate": 0.0001, "loss": 8.4349, "loss/crossentropy": 2.258981704711914, "loss/hidden": 3.328125, "loss/jsd": 0.0, "loss/logits": 0.2933773547410965, "step": 2276 }, { "epoch": 0.142375, "grad_norm": 3.53125, "grad_norm_var": 0.1346099853515625, "learning_rate": 0.0001, "loss": 8.5544, "loss/crossentropy": 2.414697289466858, "loss/hidden": 3.328125, "loss/jsd": 0.0, "loss/logits": 0.28715577721595764, "step": 2278 }, { "epoch": 0.1425, "grad_norm": 3.21875, "grad_norm_var": 0.12939453125, "learning_rate": 0.0001, "loss": 8.4856, "loss/crossentropy": 2.343166470527649, "loss/hidden": 3.2734375, "loss/jsd": 0.0, "loss/logits": 0.2715238779783249, "step": 2280 }, { "epoch": 0.142625, "grad_norm": 3.09375, "grad_norm_var": 0.13337300618489584, "learning_rate": 0.0001, "loss": 8.4238, "loss/crossentropy": 2.1820271015167236, "loss/hidden": 3.2421875, "loss/jsd": 0.0, "loss/logits": 0.26598094403743744, "step": 2282 }, { "epoch": 0.14275, "grad_norm": 2.96875, "grad_norm_var": 0.14114583333333333, "learning_rate": 0.0001, "loss": 8.2716, "loss/crossentropy": 2.481716513633728, "loss/hidden": 3.328125, "loss/jsd": 0.0, "loss/logits": 0.2877477705478668, "step": 2284 }, { "epoch": 0.142875, "grad_norm": 3.078125, "grad_norm_var": 0.14442952473958334, "learning_rate": 0.0001, "loss": 8.2991, "loss/crossentropy": 2.216909646987915, "loss/hidden": 3.25, "loss/jsd": 0.0, "loss/logits": 0.2711998224258423, "step": 2286 }, { "epoch": 0.143, "grad_norm": 3.34375, "grad_norm_var": 0.14483133951822916, "learning_rate": 0.0001, "loss": 8.2365, "loss/crossentropy": 2.162856936454773, "loss/hidden": 3.2578125, "loss/jsd": 0.0, "loss/logits": 0.28143976628780365, "step": 2288 }, { "epoch": 0.143125, "grad_norm": 3.203125, "grad_norm_var": 0.0439605712890625, "learning_rate": 0.0001, "loss": 8.5854, "loss/crossentropy": 2.5739957094192505, "loss/hidden": 3.375, "loss/jsd": 0.0, "loss/logits": 0.2899349331855774, "step": 2290 }, { "epoch": 0.14325, "grad_norm": 3.46875, "grad_norm_var": 0.0454742431640625, "learning_rate": 0.0001, "loss": 8.6261, "loss/crossentropy": 2.441213607788086, "loss/hidden": 3.3125, "loss/jsd": 0.0, "loss/logits": 0.28684163093566895, "step": 2292 }, { "epoch": 0.143375, "grad_norm": 3.359375, "grad_norm_var": 0.031083170572916666, "learning_rate": 0.0001, "loss": 8.4656, "loss/crossentropy": 2.3305513858795166, "loss/hidden": 3.53125, "loss/jsd": 0.0, "loss/logits": 0.30575811862945557, "step": 2294 }, { "epoch": 0.1435, "grad_norm": 3.640625, "grad_norm_var": 0.04129231770833333, "learning_rate": 0.0001, "loss": 8.5207, "loss/crossentropy": 2.4758119583129883, "loss/hidden": 3.3046875, "loss/jsd": 0.0, "loss/logits": 0.2866526395082474, "step": 2296 }, { "epoch": 0.143625, "grad_norm": 3.046875, "grad_norm_var": 0.043822224934895834, "learning_rate": 0.0001, "loss": 8.308, "loss/crossentropy": 2.5252416133880615, "loss/hidden": 3.359375, "loss/jsd": 0.0, "loss/logits": 0.29467783868312836, "step": 2298 }, { "epoch": 0.14375, "grad_norm": 2.90625, "grad_norm_var": 0.0426910400390625, "learning_rate": 0.0001, "loss": 8.0802, "loss/crossentropy": 2.417738676071167, "loss/hidden": 3.3046875, "loss/jsd": 0.0, "loss/logits": 0.29233458638191223, "step": 2300 }, { "epoch": 0.143875, "grad_norm": 3.09375, "grad_norm_var": 0.04114176432291667, "learning_rate": 0.0001, "loss": 8.2872, "loss/crossentropy": 2.298841118812561, "loss/hidden": 3.2890625, "loss/jsd": 0.0, "loss/logits": 0.25550100207328796, "step": 2302 }, { "epoch": 0.144, "grad_norm": 3.28125, "grad_norm_var": 0.03951416015625, "learning_rate": 0.0001, "loss": 8.593, "loss/crossentropy": 2.283052444458008, "loss/hidden": 3.3125, "loss/jsd": 0.0, "loss/logits": 0.29309652745723724, "step": 2304 }, { "epoch": 0.144125, "grad_norm": 3.15625, "grad_norm_var": 0.0362457275390625, "learning_rate": 0.0001, "loss": 8.1915, "loss/crossentropy": 2.3673131465911865, "loss/hidden": 3.25, "loss/jsd": 0.0, "loss/logits": 0.2645817846059799, "step": 2306 }, { "epoch": 0.14425, "grad_norm": 3.125, "grad_norm_var": 0.03322652180989583, "learning_rate": 0.0001, "loss": 8.4168, "loss/crossentropy": 2.3149588108062744, "loss/hidden": 3.28125, "loss/jsd": 0.0, "loss/logits": 0.26110585033893585, "step": 2308 }, { "epoch": 0.144375, "grad_norm": 3.203125, "grad_norm_var": 0.0287506103515625, "learning_rate": 0.0001, "loss": 8.566, "loss/crossentropy": 2.2937344312667847, "loss/hidden": 3.3046875, "loss/jsd": 0.0, "loss/logits": 0.2994783967733383, "step": 2310 }, { "epoch": 0.1445, "grad_norm": 3.203125, "grad_norm_var": 0.013459269205729167, "learning_rate": 0.0001, "loss": 8.3584, "loss/crossentropy": 2.4348164796829224, "loss/hidden": 3.265625, "loss/jsd": 0.0, "loss/logits": 0.28557969629764557, "step": 2312 }, { "epoch": 0.144625, "grad_norm": 4.375, "grad_norm_var": 0.10247294108072917, "learning_rate": 0.0001, "loss": 8.1973, "loss/crossentropy": 2.133277177810669, "loss/hidden": 3.234375, "loss/jsd": 0.0, "loss/logits": 0.2594181001186371, "step": 2314 }, { "epoch": 0.14475, "grad_norm": 3.09375, "grad_norm_var": 0.0970123291015625, "learning_rate": 0.0001, "loss": 8.3714, "loss/crossentropy": 2.557259678840637, "loss/hidden": 3.3203125, "loss/jsd": 0.0, "loss/logits": 0.3214830011129379, "step": 2316 }, { "epoch": 0.144875, "grad_norm": 3.09375, "grad_norm_var": 0.0958404541015625, "learning_rate": 0.0001, "loss": 8.2437, "loss/crossentropy": 2.323344588279724, "loss/hidden": 3.2734375, "loss/jsd": 0.0, "loss/logits": 0.2699511796236038, "step": 2318 }, { "epoch": 0.145, "grad_norm": 3.046875, "grad_norm_var": 0.10247294108072917, "learning_rate": 0.0001, "loss": 8.3199, "loss/crossentropy": 2.1560362577438354, "loss/hidden": 3.234375, "loss/jsd": 0.0, "loss/logits": 0.2754161208868027, "step": 2320 }, { "epoch": 0.145125, "grad_norm": 3.21875, "grad_norm_var": 0.10204671223958334, "learning_rate": 0.0001, "loss": 8.3976, "loss/crossentropy": 2.1962740421295166, "loss/hidden": 3.25, "loss/jsd": 0.0, "loss/logits": 0.2777334451675415, "step": 2322 }, { "epoch": 0.14525, "grad_norm": 3.078125, "grad_norm_var": 0.10629781087239583, "learning_rate": 0.0001, "loss": 8.3426, "loss/crossentropy": 2.225161910057068, "loss/hidden": 3.234375, "loss/jsd": 0.0, "loss/logits": 0.2512729838490486, "step": 2324 }, { "epoch": 0.145375, "grad_norm": 3.328125, "grad_norm_var": 0.11122945149739584, "learning_rate": 0.0001, "loss": 8.5134, "loss/crossentropy": 2.286335587501526, "loss/hidden": 3.2578125, "loss/jsd": 0.0, "loss/logits": 0.2837376594543457, "step": 2326 }, { "epoch": 0.1455, "grad_norm": 3.3125, "grad_norm_var": 0.11042378743489584, "learning_rate": 0.0001, "loss": 8.7059, "loss/crossentropy": 2.3663827180862427, "loss/hidden": 3.4453125, "loss/jsd": 0.0, "loss/logits": 0.3303401321172714, "step": 2328 }, { "epoch": 0.145625, "grad_norm": 3.140625, "grad_norm_var": 0.029313151041666666, "learning_rate": 0.0001, "loss": 8.3792, "loss/crossentropy": 2.273059129714966, "loss/hidden": 3.28125, "loss/jsd": 0.0, "loss/logits": 0.27710796892642975, "step": 2330 }, { "epoch": 0.14575, "grad_norm": 3.078125, "grad_norm_var": 0.027958170572916666, "learning_rate": 0.0001, "loss": 8.4054, "loss/crossentropy": 2.319531798362732, "loss/hidden": 3.3984375, "loss/jsd": 0.0, "loss/logits": 0.2798737734556198, "step": 2332 }, { "epoch": 0.145875, "grad_norm": 4.75, "grad_norm_var": 0.19403889973958333, "learning_rate": 0.0001, "loss": 8.7102, "loss/crossentropy": 2.3764997720718384, "loss/hidden": 3.28125, "loss/jsd": 0.0, "loss/logits": 0.29309070110321045, "step": 2334 }, { "epoch": 0.146, "grad_norm": 3.046875, "grad_norm_var": 0.2710601806640625, "learning_rate": 0.0001, "loss": 8.6281, "loss/crossentropy": 2.339399576187134, "loss/hidden": 3.2578125, "loss/jsd": 0.0, "loss/logits": 0.2809606343507767, "step": 2336 }, { "epoch": 0.146125, "grad_norm": 3.203125, "grad_norm_var": 0.27922261555989586, "learning_rate": 0.0001, "loss": 8.4806, "loss/crossentropy": 2.3016566038131714, "loss/hidden": 3.2421875, "loss/jsd": 0.0, "loss/logits": 0.25977956503629684, "step": 2338 }, { "epoch": 0.14625, "grad_norm": 3.21875, "grad_norm_var": 0.26383056640625, "learning_rate": 0.0001, "loss": 8.2754, "loss/crossentropy": 2.3781991004943848, "loss/hidden": 3.25, "loss/jsd": 0.0, "loss/logits": 0.2771807760000229, "step": 2340 }, { "epoch": 0.146375, "grad_norm": 3.453125, "grad_norm_var": 0.26324462890625, "learning_rate": 0.0001, "loss": 8.6139, "loss/crossentropy": 2.496955394744873, "loss/hidden": 3.2890625, "loss/jsd": 0.0, "loss/logits": 0.27403801679611206, "step": 2342 }, { "epoch": 0.1465, "grad_norm": 3.0, "grad_norm_var": 0.2773590087890625, "learning_rate": 0.0001, "loss": 8.1782, "loss/crossentropy": 2.2161307334899902, "loss/hidden": 3.3203125, "loss/jsd": 0.0, "loss/logits": 0.26936174929142, "step": 2344 }, { "epoch": 0.146625, "grad_norm": 3.125, "grad_norm_var": 0.29517822265625, "learning_rate": 0.0001, "loss": 8.2396, "loss/crossentropy": 2.3318371772766113, "loss/hidden": 3.2265625, "loss/jsd": 0.0, "loss/logits": 0.26794174313545227, "step": 2346 }, { "epoch": 0.14675, "grad_norm": 3.390625, "grad_norm_var": 0.30025126139322916, "learning_rate": 0.0001, "loss": 8.3753, "loss/crossentropy": 2.5500062704086304, "loss/hidden": 3.2734375, "loss/jsd": 0.0, "loss/logits": 0.2776283770799637, "step": 2348 }, { "epoch": 0.146875, "grad_norm": 2.96875, "grad_norm_var": 0.14777730305989584, "learning_rate": 0.0001, "loss": 8.1207, "loss/crossentropy": 2.118456542491913, "loss/hidden": 3.234375, "loss/jsd": 0.0, "loss/logits": 0.26154835522174835, "step": 2350 }, { "epoch": 0.147, "grad_norm": 3.265625, "grad_norm_var": 0.03133036295572917, "learning_rate": 0.0001, "loss": 8.3371, "loss/crossentropy": 2.3705456256866455, "loss/hidden": 3.28125, "loss/jsd": 0.0, "loss/logits": 0.28914862871170044, "step": 2352 }, { "epoch": 0.147125, "grad_norm": 3.234375, "grad_norm_var": 0.031086222330729166, "learning_rate": 0.0001, "loss": 8.4854, "loss/crossentropy": 2.214228391647339, "loss/hidden": 3.1875, "loss/jsd": 0.0, "loss/logits": 0.28034496307373047, "step": 2354 }, { "epoch": 0.14725, "grad_norm": 2.84375, "grad_norm_var": 0.03732096354166667, "learning_rate": 0.0001, "loss": 8.1409, "loss/crossentropy": 2.265252947807312, "loss/hidden": 3.2578125, "loss/jsd": 0.0, "loss/logits": 0.2799190580844879, "step": 2356 }, { "epoch": 0.147375, "grad_norm": 3.140625, "grad_norm_var": 0.030159505208333333, "learning_rate": 0.0001, "loss": 8.2196, "loss/crossentropy": 2.523659586906433, "loss/hidden": 3.2890625, "loss/jsd": 0.0, "loss/logits": 0.26604168117046356, "step": 2358 }, { "epoch": 0.1475, "grad_norm": 2.953125, "grad_norm_var": 0.031135050455729167, "learning_rate": 0.0001, "loss": 8.2265, "loss/crossentropy": 2.3877971172332764, "loss/hidden": 3.3125, "loss/jsd": 0.0, "loss/logits": 0.2672644555568695, "step": 2360 }, { "epoch": 0.147625, "grad_norm": 2.84375, "grad_norm_var": 0.029564412434895833, "learning_rate": 0.0001, "loss": 8.3976, "loss/crossentropy": 2.3036141395568848, "loss/hidden": 3.2734375, "loss/jsd": 0.0, "loss/logits": 0.27549657225608826, "step": 2362 }, { "epoch": 0.14775, "grad_norm": 3.296875, "grad_norm_var": 0.025325520833333334, "learning_rate": 0.0001, "loss": 8.4514, "loss/crossentropy": 2.2630138397216797, "loss/hidden": 3.234375, "loss/jsd": 0.0, "loss/logits": 0.2665908634662628, "step": 2364 }, { "epoch": 0.147875, "grad_norm": 3.421875, "grad_norm_var": 0.03718973795572917, "learning_rate": 0.0001, "loss": 8.6282, "loss/crossentropy": 2.3436743021011353, "loss/hidden": 3.28125, "loss/jsd": 0.0, "loss/logits": 0.27386774867773056, "step": 2366 }, { "epoch": 0.148, "grad_norm": 3.328125, "grad_norm_var": 0.0340728759765625, "learning_rate": 0.0001, "loss": 8.421, "loss/crossentropy": 2.4493263959884644, "loss/hidden": 3.375, "loss/jsd": 0.0, "loss/logits": 0.3048429489135742, "step": 2368 }, { "epoch": 0.148125, "grad_norm": 3.0625, "grad_norm_var": 0.03430582682291667, "learning_rate": 0.0001, "loss": 8.5809, "loss/crossentropy": 2.4388129711151123, "loss/hidden": 3.234375, "loss/jsd": 0.0, "loss/logits": 0.2775990962982178, "step": 2370 }, { "epoch": 0.14825, "grad_norm": 3.0625, "grad_norm_var": 0.030052693684895833, "learning_rate": 0.0001, "loss": 8.3091, "loss/crossentropy": 2.3041138648986816, "loss/hidden": 3.2578125, "loss/jsd": 0.0, "loss/logits": 0.2724207639694214, "step": 2372 }, { "epoch": 0.148375, "grad_norm": 3.125, "grad_norm_var": 0.03465067545572917, "learning_rate": 0.0001, "loss": 8.2688, "loss/crossentropy": 1.9960012435913086, "loss/hidden": 3.34375, "loss/jsd": 0.0, "loss/logits": 0.2777027040719986, "step": 2374 }, { "epoch": 0.1485, "grad_norm": 3.140625, "grad_norm_var": 0.050348917643229164, "learning_rate": 0.0001, "loss": 8.513, "loss/crossentropy": 2.154646396636963, "loss/hidden": 3.4375, "loss/jsd": 0.0, "loss/logits": 0.318150132894516, "step": 2376 }, { "epoch": 0.148625, "grad_norm": 3.03125, "grad_norm_var": 0.04312744140625, "learning_rate": 0.0001, "loss": 8.5843, "loss/crossentropy": 2.460582137107849, "loss/hidden": 3.2734375, "loss/jsd": 0.0, "loss/logits": 0.2954748272895813, "step": 2378 }, { "epoch": 0.14875, "grad_norm": 3.34375, "grad_norm_var": 0.051493326822916664, "learning_rate": 0.0001, "loss": 8.3993, "loss/crossentropy": 2.1957290172576904, "loss/hidden": 3.234375, "loss/jsd": 0.0, "loss/logits": 0.2675289958715439, "step": 2380 }, { "epoch": 0.148875, "grad_norm": 3.0625, "grad_norm_var": 0.04504801432291667, "learning_rate": 0.0001, "loss": 8.4317, "loss/crossentropy": 2.2120442390441895, "loss/hidden": 3.4453125, "loss/jsd": 0.0, "loss/logits": 0.30290304124355316, "step": 2382 }, { "epoch": 0.149, "grad_norm": 3.28125, "grad_norm_var": 0.0464752197265625, "learning_rate": 0.0001, "loss": 8.4022, "loss/crossentropy": 2.42188036441803, "loss/hidden": 3.2734375, "loss/jsd": 0.0, "loss/logits": 0.2719276398420334, "step": 2384 }, { "epoch": 0.149125, "grad_norm": 3.171875, "grad_norm_var": 0.045947265625, "learning_rate": 0.0001, "loss": 8.5146, "loss/crossentropy": 2.394136667251587, "loss/hidden": 3.34375, "loss/jsd": 0.0, "loss/logits": 0.2908514738082886, "step": 2386 }, { "epoch": 0.14925, "grad_norm": 3.125, "grad_norm_var": 0.043355305989583336, "learning_rate": 0.0001, "loss": 8.2204, "loss/crossentropy": 2.0119369626045227, "loss/hidden": 3.265625, "loss/jsd": 0.0, "loss/logits": 0.2561585605144501, "step": 2388 }, { "epoch": 0.149375, "grad_norm": 3.046875, "grad_norm_var": 0.03824462890625, "learning_rate": 0.0001, "loss": 8.4495, "loss/crossentropy": 2.52905535697937, "loss/hidden": 3.328125, "loss/jsd": 0.0, "loss/logits": 0.29769638180732727, "step": 2390 }, { "epoch": 0.1495, "grad_norm": 3.109375, "grad_norm_var": 0.029715983072916667, "learning_rate": 0.0001, "loss": 8.4764, "loss/crossentropy": 2.2129627466201782, "loss/hidden": 3.3671875, "loss/jsd": 0.0, "loss/logits": 0.3012372702360153, "step": 2392 }, { "epoch": 0.149625, "grad_norm": 3.140625, "grad_norm_var": 0.028270467122395834, "learning_rate": 0.0001, "loss": 8.3592, "loss/crossentropy": 2.3810667991638184, "loss/hidden": 3.28125, "loss/jsd": 0.0, "loss/logits": 0.2853284478187561, "step": 2394 }, { "epoch": 0.14975, "grad_norm": 3.609375, "grad_norm_var": 0.029390462239583335, "learning_rate": 0.0001, "loss": 8.4495, "loss/crossentropy": 2.4027936458587646, "loss/hidden": 3.328125, "loss/jsd": 0.0, "loss/logits": 0.290962889790535, "step": 2396 }, { "epoch": 0.149875, "grad_norm": 3.046875, "grad_norm_var": 0.029002888997395834, "learning_rate": 0.0001, "loss": 8.3231, "loss/crossentropy": 2.3980143070220947, "loss/hidden": 3.265625, "loss/jsd": 0.0, "loss/logits": 0.2743261456489563, "step": 2398 }, { "epoch": 0.15, "grad_norm": 3.171875, "grad_norm_var": 0.026155598958333335, "learning_rate": 0.0001, "loss": 8.3352, "loss/crossentropy": 2.1707264184951782, "loss/hidden": 3.34375, "loss/jsd": 0.0, "loss/logits": 0.26785067468881607, "step": 2400 }, { "epoch": 0.150125, "grad_norm": 2.953125, "grad_norm_var": 0.030257161458333334, "learning_rate": 0.0001, "loss": 8.5202, "loss/crossentropy": 2.4350671768188477, "loss/hidden": 3.296875, "loss/jsd": 0.0, "loss/logits": 0.3034633994102478, "step": 2402 }, { "epoch": 0.15025, "grad_norm": 9.5, "grad_norm_var": 2.522652180989583, "learning_rate": 0.0001, "loss": 8.6723, "loss/crossentropy": 2.305592894554138, "loss/hidden": 3.25, "loss/jsd": 0.0, "loss/logits": 0.27664943039417267, "step": 2404 }, { "epoch": 0.150375, "grad_norm": 3.296875, "grad_norm_var": 2.494429524739583, "learning_rate": 0.0001, "loss": 8.7736, "loss/crossentropy": 2.1731058955192566, "loss/hidden": 3.359375, "loss/jsd": 0.0, "loss/logits": 0.2912386506795883, "step": 2406 }, { "epoch": 0.1505, "grad_norm": 3.421875, "grad_norm_var": 2.4850260416666665, "learning_rate": 0.0001, "loss": 8.4638, "loss/crossentropy": 2.230413794517517, "loss/hidden": 3.25, "loss/jsd": 0.0, "loss/logits": 0.29132725298404694, "step": 2408 }, { "epoch": 0.150625, "grad_norm": 3.1875, "grad_norm_var": 2.477733357747396, "learning_rate": 0.0001, "loss": 8.4085, "loss/crossentropy": 2.353387713432312, "loss/hidden": 3.2421875, "loss/jsd": 0.0, "loss/logits": 0.27667418122291565, "step": 2410 }, { "epoch": 0.15075, "grad_norm": 3.171875, "grad_norm_var": 2.5046702067057294, "learning_rate": 0.0001, "loss": 8.3084, "loss/crossentropy": 2.2864267826080322, "loss/hidden": 3.2421875, "loss/jsd": 0.0, "loss/logits": 0.29395492374897003, "step": 2412 }, { "epoch": 0.150875, "grad_norm": 3.03125, "grad_norm_var": 2.518973795572917, "learning_rate": 0.0001, "loss": 8.233, "loss/crossentropy": 2.203416109085083, "loss/hidden": 3.2421875, "loss/jsd": 0.0, "loss/logits": 0.2798038125038147, "step": 2414 }, { "epoch": 0.151, "grad_norm": 3.21875, "grad_norm_var": 2.5182037353515625, "learning_rate": 0.0001, "loss": 8.5614, "loss/crossentropy": 2.3979218006134033, "loss/hidden": 3.28125, "loss/jsd": 0.0, "loss/logits": 0.2655583620071411, "step": 2416 }, { "epoch": 0.151125, "grad_norm": 2.921875, "grad_norm_var": 2.5204060872395835, "learning_rate": 0.0001, "loss": 8.2764, "loss/crossentropy": 2.2898428440093994, "loss/hidden": 3.265625, "loss/jsd": 0.0, "loss/logits": 0.28126421570777893, "step": 2418 }, { "epoch": 0.15125, "grad_norm": 3.46875, "grad_norm_var": 0.058430989583333336, "learning_rate": 0.0001, "loss": 8.3922, "loss/crossentropy": 2.491398334503174, "loss/hidden": 3.3515625, "loss/jsd": 0.0, "loss/logits": 0.3128269612789154, "step": 2420 }, { "epoch": 0.151375, "grad_norm": 3.265625, "grad_norm_var": 0.022880045572916667, "learning_rate": 0.0001, "loss": 8.3701, "loss/crossentropy": 2.179121255874634, "loss/hidden": 3.265625, "loss/jsd": 0.0, "loss/logits": 0.26591023802757263, "step": 2422 }, { "epoch": 0.1515, "grad_norm": 3.046875, "grad_norm_var": 0.01845703125, "learning_rate": 0.0001, "loss": 8.3354, "loss/crossentropy": 2.288297653198242, "loss/hidden": 3.296875, "loss/jsd": 0.0, "loss/logits": 0.2655438780784607, "step": 2424 }, { "epoch": 0.151625, "grad_norm": 2.96875, "grad_norm_var": 0.020213826497395834, "learning_rate": 0.0001, "loss": 8.362, "loss/crossentropy": 2.2921411991119385, "loss/hidden": 3.2421875, "loss/jsd": 0.0, "loss/logits": 0.26359351724386215, "step": 2426 }, { "epoch": 0.15175, "grad_norm": 3.203125, "grad_norm_var": 0.020113118489583335, "learning_rate": 0.0001, "loss": 8.3027, "loss/crossentropy": 2.018544852733612, "loss/hidden": 3.2109375, "loss/jsd": 0.0, "loss/logits": 0.27101635932922363, "step": 2428 }, { "epoch": 0.151875, "grad_norm": 3.125, "grad_norm_var": 0.017829386393229167, "learning_rate": 0.0001, "loss": 8.3429, "loss/crossentropy": 2.245858669281006, "loss/hidden": 3.234375, "loss/jsd": 0.0, "loss/logits": 0.2750082165002823, "step": 2430 }, { "epoch": 0.152, "grad_norm": 3.171875, "grad_norm_var": 0.019852701822916666, "learning_rate": 0.0001, "loss": 8.2621, "loss/crossentropy": 2.5454466342926025, "loss/hidden": 3.2421875, "loss/jsd": 0.0, "loss/logits": 0.26717574894428253, "step": 2432 }, { "epoch": 0.152125, "grad_norm": 3.140625, "grad_norm_var": 0.01705322265625, "learning_rate": 0.0001, "loss": 8.3405, "loss/crossentropy": 2.361135959625244, "loss/hidden": 3.265625, "loss/jsd": 0.0, "loss/logits": 0.273520365357399, "step": 2434 }, { "epoch": 0.15225, "grad_norm": 3.0, "grad_norm_var": 0.012581380208333333, "learning_rate": 0.0001, "loss": 8.186, "loss/crossentropy": 2.2724266052246094, "loss/hidden": 3.234375, "loss/jsd": 0.0, "loss/logits": 0.2594418376684189, "step": 2436 }, { "epoch": 0.152375, "grad_norm": 3.171875, "grad_norm_var": 0.011421712239583333, "learning_rate": 0.0001, "loss": 8.4541, "loss/crossentropy": 2.2280519008636475, "loss/hidden": 3.234375, "loss/jsd": 0.0, "loss/logits": 0.26080697774887085, "step": 2438 }, { "epoch": 0.1525, "grad_norm": 3.296875, "grad_norm_var": 0.013016764322916667, "learning_rate": 0.0001, "loss": 8.288, "loss/crossentropy": 2.288727283477783, "loss/hidden": 3.234375, "loss/jsd": 0.0, "loss/logits": 0.2835286557674408, "step": 2440 }, { "epoch": 0.152625, "grad_norm": 3.0, "grad_norm_var": 0.013133748372395834, "learning_rate": 0.0001, "loss": 8.3711, "loss/crossentropy": 2.5871081352233887, "loss/hidden": 3.28125, "loss/jsd": 0.0, "loss/logits": 0.2803485840559006, "step": 2442 }, { "epoch": 0.15275, "grad_norm": 3.015625, "grad_norm_var": 0.015453084309895834, "learning_rate": 0.0001, "loss": 8.2484, "loss/crossentropy": 2.170526623725891, "loss/hidden": 3.234375, "loss/jsd": 0.0, "loss/logits": 0.2835932523012161, "step": 2444 }, { "epoch": 0.152875, "grad_norm": 3.328125, "grad_norm_var": 0.0181793212890625, "learning_rate": 0.0001, "loss": 8.1166, "loss/crossentropy": 2.2848747968673706, "loss/hidden": 3.3125, "loss/jsd": 0.0, "loss/logits": 0.2959897220134735, "step": 2446 }, { "epoch": 0.153, "grad_norm": 3.0625, "grad_norm_var": 0.0361328125, "learning_rate": 0.0001, "loss": 8.4482, "loss/crossentropy": 2.31646192073822, "loss/hidden": 3.3125, "loss/jsd": 0.0, "loss/logits": 0.3019329309463501, "step": 2448 }, { "epoch": 0.153125, "grad_norm": 2.953125, "grad_norm_var": 0.03801676432291667, "learning_rate": 0.0001, "loss": 8.4538, "loss/crossentropy": 2.2353241443634033, "loss/hidden": 3.21875, "loss/jsd": 0.0, "loss/logits": 0.27827736735343933, "step": 2450 }, { "epoch": 0.15325, "grad_norm": 3.1875, "grad_norm_var": 0.03535054524739583, "learning_rate": 0.0001, "loss": 8.6794, "loss/crossentropy": 2.2519538402557373, "loss/hidden": 3.40625, "loss/jsd": 0.0, "loss/logits": 0.3021092116832733, "step": 2452 }, { "epoch": 0.153375, "grad_norm": 3.265625, "grad_norm_var": 0.03618062337239583, "learning_rate": 0.0001, "loss": 8.3217, "loss/crossentropy": 2.2183037996292114, "loss/hidden": 3.2421875, "loss/jsd": 0.0, "loss/logits": 0.26068025827407837, "step": 2454 }, { "epoch": 0.1535, "grad_norm": 3.296875, "grad_norm_var": 0.04420166015625, "learning_rate": 0.0001, "loss": 8.4633, "loss/crossentropy": 2.500266432762146, "loss/hidden": 3.3984375, "loss/jsd": 0.0, "loss/logits": 0.2931359112262726, "step": 2456 }, { "epoch": 0.153625, "grad_norm": 2.984375, "grad_norm_var": 0.04478251139322917, "learning_rate": 0.0001, "loss": 8.436, "loss/crossentropy": 2.303490161895752, "loss/hidden": 3.234375, "loss/jsd": 0.0, "loss/logits": 0.2771891579031944, "step": 2458 }, { "epoch": 0.15375, "grad_norm": 3.265625, "grad_norm_var": 0.03901265462239583, "learning_rate": 0.0001, "loss": 8.5569, "loss/crossentropy": 2.461255431175232, "loss/hidden": 3.265625, "loss/jsd": 0.0, "loss/logits": 0.2784468084573746, "step": 2460 }, { "epoch": 0.153875, "grad_norm": 3.28125, "grad_norm_var": 0.038407389322916666, "learning_rate": 0.0001, "loss": 8.4363, "loss/crossentropy": 2.330198287963867, "loss/hidden": 3.3984375, "loss/jsd": 0.0, "loss/logits": 0.32574698328971863, "step": 2462 }, { "epoch": 0.154, "grad_norm": 3.203125, "grad_norm_var": 0.0226715087890625, "learning_rate": 0.0001, "loss": 8.2665, "loss/crossentropy": 2.2690482139587402, "loss/hidden": 3.2890625, "loss/jsd": 0.0, "loss/logits": 0.26994916796684265, "step": 2464 }, { "epoch": 0.154125, "grad_norm": 3.3125, "grad_norm_var": 0.019041951497395834, "learning_rate": 0.0001, "loss": 8.1331, "loss/crossentropy": 2.227648973464966, "loss/hidden": 3.2109375, "loss/jsd": 0.0, "loss/logits": 0.2809407413005829, "step": 2466 }, { "epoch": 0.15425, "grad_norm": 3.15625, "grad_norm_var": 0.019038899739583334, "learning_rate": 0.0001, "loss": 8.4534, "loss/crossentropy": 2.4678770303726196, "loss/hidden": 3.28125, "loss/jsd": 0.0, "loss/logits": 0.2576301246881485, "step": 2468 }, { "epoch": 0.154375, "grad_norm": 2.96875, "grad_norm_var": 0.031590779622395836, "learning_rate": 0.0001, "loss": 8.2793, "loss/crossentropy": 2.3544777631759644, "loss/hidden": 3.2578125, "loss/jsd": 0.0, "loss/logits": 0.2835593670606613, "step": 2470 }, { "epoch": 0.1545, "grad_norm": 3.0, "grad_norm_var": 0.024641927083333334, "learning_rate": 0.0001, "loss": 8.3369, "loss/crossentropy": 2.3852498531341553, "loss/hidden": 3.2265625, "loss/jsd": 0.0, "loss/logits": 0.2661430686712265, "step": 2472 }, { "epoch": 0.154625, "grad_norm": 3.109375, "grad_norm_var": 0.020067342122395835, "learning_rate": 0.0001, "loss": 8.4255, "loss/crossentropy": 2.417587161064148, "loss/hidden": 3.25, "loss/jsd": 0.0, "loss/logits": 0.26272399723529816, "step": 2474 }, { "epoch": 0.15475, "grad_norm": 2.96875, "grad_norm_var": 0.025633748372395834, "learning_rate": 0.0001, "loss": 8.511, "loss/crossentropy": 2.2773125171661377, "loss/hidden": 3.2578125, "loss/jsd": 0.0, "loss/logits": 0.2794167101383209, "step": 2476 }, { "epoch": 0.154875, "grad_norm": 3.0, "grad_norm_var": 0.022956339518229167, "learning_rate": 0.0001, "loss": 8.3734, "loss/crossentropy": 2.395659923553467, "loss/hidden": 3.296875, "loss/jsd": 0.0, "loss/logits": 0.28663066029548645, "step": 2478 }, { "epoch": 0.155, "grad_norm": 3.3125, "grad_norm_var": 0.0287506103515625, "learning_rate": 0.0001, "loss": 8.0888, "loss/crossentropy": 2.0191069841384888, "loss/hidden": 3.171875, "loss/jsd": 0.0, "loss/logits": 0.24243928492069244, "step": 2480 }, { "epoch": 0.155125, "grad_norm": 3.296875, "grad_norm_var": 0.031787109375, "learning_rate": 0.0001, "loss": 8.2903, "loss/crossentropy": 2.2635436058044434, "loss/hidden": 3.3125, "loss/jsd": 0.0, "loss/logits": 0.26046207547187805, "step": 2482 }, { "epoch": 0.15525, "grad_norm": 3.109375, "grad_norm_var": 0.032177734375, "learning_rate": 0.0001, "loss": 8.3064, "loss/crossentropy": 2.3955200910568237, "loss/hidden": 3.3046875, "loss/jsd": 0.0, "loss/logits": 0.2639884054660797, "step": 2484 }, { "epoch": 0.155375, "grad_norm": 3.671875, "grad_norm_var": 0.1326812744140625, "learning_rate": 0.0001, "loss": 8.5258, "loss/crossentropy": 2.4589436054229736, "loss/hidden": 3.2734375, "loss/jsd": 0.0, "loss/logits": 0.2719157636165619, "step": 2486 }, { "epoch": 0.1555, "grad_norm": 3.078125, "grad_norm_var": 0.13032124837239584, "learning_rate": 0.0001, "loss": 8.1377, "loss/crossentropy": 2.343076705932617, "loss/hidden": 3.28125, "loss/jsd": 0.0, "loss/logits": 0.27290938794612885, "step": 2488 }, { "epoch": 0.155625, "grad_norm": 3.25, "grad_norm_var": 0.131005859375, "learning_rate": 0.0001, "loss": 8.4763, "loss/crossentropy": 2.447250247001648, "loss/hidden": 3.28125, "loss/jsd": 0.0, "loss/logits": 0.2733266055583954, "step": 2490 }, { "epoch": 0.15575, "grad_norm": 3.125, "grad_norm_var": 0.12581278483072916, "learning_rate": 0.0001, "loss": 8.3088, "loss/crossentropy": 2.1297186613082886, "loss/hidden": 3.1953125, "loss/jsd": 0.0, "loss/logits": 0.2590889632701874, "step": 2492 }, { "epoch": 0.155875, "grad_norm": 3.09375, "grad_norm_var": 0.12429097493489584, "learning_rate": 0.0001, "loss": 8.5055, "loss/crossentropy": 2.6144137382507324, "loss/hidden": 3.2734375, "loss/jsd": 0.0, "loss/logits": 0.27285242080688477, "step": 2494 }, { "epoch": 0.156, "grad_norm": 2.9375, "grad_norm_var": 0.12184956868489584, "learning_rate": 0.0001, "loss": 8.1753, "loss/crossentropy": 2.18227219581604, "loss/hidden": 3.265625, "loss/jsd": 0.0, "loss/logits": 0.2713698595762253, "step": 2496 }, { "epoch": 0.156125, "grad_norm": 3.328125, "grad_norm_var": 0.12237955729166666, "learning_rate": 0.0001, "loss": 8.3438, "loss/crossentropy": 2.5376009941101074, "loss/hidden": 3.296875, "loss/jsd": 0.0, "loss/logits": 0.29997144639492035, "step": 2498 }, { "epoch": 0.15625, "grad_norm": 3.0625, "grad_norm_var": 0.11913960774739583, "learning_rate": 0.0001, "loss": 8.3455, "loss/crossentropy": 2.3957122564315796, "loss/hidden": 3.2421875, "loss/jsd": 0.0, "loss/logits": 0.2771972119808197, "step": 2500 }, { "epoch": 0.156375, "grad_norm": 3.15625, "grad_norm_var": 0.013700358072916667, "learning_rate": 0.0001, "loss": 8.2594, "loss/crossentropy": 2.3807398080825806, "loss/hidden": 3.2578125, "loss/jsd": 0.0, "loss/logits": 0.27448034286499023, "step": 2502 }, { "epoch": 0.1565, "grad_norm": 3.1875, "grad_norm_var": 0.013557942708333333, "learning_rate": 0.0001, "loss": 8.3676, "loss/crossentropy": 2.4909543991088867, "loss/hidden": 3.265625, "loss/jsd": 0.0, "loss/logits": 0.2869381755590439, "step": 2504 }, { "epoch": 0.156625, "grad_norm": 2.984375, "grad_norm_var": 0.013248697916666666, "learning_rate": 0.0001, "loss": 8.2354, "loss/crossentropy": 2.4148250818252563, "loss/hidden": 3.28125, "loss/jsd": 0.0, "loss/logits": 0.2610015794634819, "step": 2506 }, { "epoch": 0.15675, "grad_norm": 3.375, "grad_norm_var": 0.014728800455729166, "learning_rate": 0.0001, "loss": 8.3965, "loss/crossentropy": 2.4353911876678467, "loss/hidden": 3.25, "loss/jsd": 0.0, "loss/logits": 0.26869361102581024, "step": 2508 }, { "epoch": 0.156875, "grad_norm": 3.25, "grad_norm_var": 0.019286092122395834, "learning_rate": 0.0001, "loss": 8.3729, "loss/crossentropy": 2.4266940355300903, "loss/hidden": 3.28125, "loss/jsd": 0.0, "loss/logits": 0.27790483832359314, "step": 2510 }, { "epoch": 0.157, "grad_norm": 3.1875, "grad_norm_var": 0.016630045572916665, "learning_rate": 0.0001, "loss": 8.4859, "loss/crossentropy": 2.544732928276062, "loss/hidden": 3.3125, "loss/jsd": 0.0, "loss/logits": 0.315563440322876, "step": 2512 }, { "epoch": 0.157125, "grad_norm": 3.34375, "grad_norm_var": 0.016706339518229165, "learning_rate": 0.0001, "loss": 8.3694, "loss/crossentropy": 2.2503061294555664, "loss/hidden": 3.2890625, "loss/jsd": 0.0, "loss/logits": 0.25774821639060974, "step": 2514 }, { "epoch": 0.15725, "grad_norm": 3.453125, "grad_norm_var": 0.020734659830729165, "learning_rate": 0.0001, "loss": 8.1868, "loss/crossentropy": 2.108790874481201, "loss/hidden": 3.3203125, "loss/jsd": 0.0, "loss/logits": 0.2711709439754486, "step": 2516 }, { "epoch": 0.157375, "grad_norm": 2.890625, "grad_norm_var": 0.026786295572916667, "learning_rate": 0.0001, "loss": 8.1875, "loss/crossentropy": 2.4312883615493774, "loss/hidden": 3.2578125, "loss/jsd": 0.0, "loss/logits": 0.26691682636737823, "step": 2518 }, { "epoch": 0.1575, "grad_norm": 3.375, "grad_norm_var": 0.03139546712239583, "learning_rate": 0.0001, "loss": 8.6367, "loss/crossentropy": 2.4578086137771606, "loss/hidden": 3.3046875, "loss/jsd": 0.0, "loss/logits": 0.29156582057476044, "step": 2520 }, { "epoch": 0.157625, "grad_norm": 3.21875, "grad_norm_var": 0.028693644205729167, "learning_rate": 0.0001, "loss": 8.4396, "loss/crossentropy": 1.9031986594200134, "loss/hidden": 3.34375, "loss/jsd": 0.0, "loss/logits": 0.2655710130929947, "step": 2522 }, { "epoch": 0.15775, "grad_norm": 3.3125, "grad_norm_var": 0.029352823893229168, "learning_rate": 0.0001, "loss": 8.2257, "loss/crossentropy": 2.2168606519699097, "loss/hidden": 3.328125, "loss/jsd": 0.0, "loss/logits": 0.261834055185318, "step": 2524 }, { "epoch": 0.157875, "grad_norm": 3.046875, "grad_norm_var": 0.02506103515625, "learning_rate": 0.0001, "loss": 8.4744, "loss/crossentropy": 2.310633659362793, "loss/hidden": 3.2421875, "loss/jsd": 0.0, "loss/logits": 0.27251285314559937, "step": 2526 }, { "epoch": 0.158, "grad_norm": 3.0625, "grad_norm_var": 0.02662353515625, "learning_rate": 0.0001, "loss": 8.4234, "loss/crossentropy": 2.2944494485855103, "loss/hidden": 3.25, "loss/jsd": 0.0, "loss/logits": 0.2710151672363281, "step": 2528 }, { "epoch": 0.158125, "grad_norm": 3.015625, "grad_norm_var": 0.029683430989583332, "learning_rate": 0.0001, "loss": 8.0669, "loss/crossentropy": 2.162436366081238, "loss/hidden": 3.265625, "loss/jsd": 0.0, "loss/logits": 0.2699380964040756, "step": 2530 }, { "epoch": 0.15825, "grad_norm": 3.0625, "grad_norm_var": 0.024702962239583334, "learning_rate": 0.0001, "loss": 8.1866, "loss/crossentropy": 2.358327627182007, "loss/hidden": 3.296875, "loss/jsd": 0.0, "loss/logits": 0.28228873014450073, "step": 2532 }, { "epoch": 0.158375, "grad_norm": 3.25, "grad_norm_var": 0.020873006184895834, "learning_rate": 0.0001, "loss": 8.3448, "loss/crossentropy": 2.4418132305145264, "loss/hidden": 3.3046875, "loss/jsd": 0.0, "loss/logits": 0.2814173698425293, "step": 2534 }, { "epoch": 0.1585, "grad_norm": 3.0, "grad_norm_var": 0.0148834228515625, "learning_rate": 0.0001, "loss": 8.1711, "loss/crossentropy": 2.194345235824585, "loss/hidden": 3.3046875, "loss/jsd": 0.0, "loss/logits": 0.2765004634857178, "step": 2536 }, { "epoch": 0.158625, "grad_norm": 3.03125, "grad_norm_var": 0.01500244140625, "learning_rate": 0.0001, "loss": 8.4133, "loss/crossentropy": 2.492135763168335, "loss/hidden": 3.21875, "loss/jsd": 0.0, "loss/logits": 0.2749137431383133, "step": 2538 }, { "epoch": 0.15875, "grad_norm": 3.109375, "grad_norm_var": 0.011393229166666666, "learning_rate": 0.0001, "loss": 8.2062, "loss/crossentropy": 2.087536931037903, "loss/hidden": 3.25, "loss/jsd": 0.0, "loss/logits": 0.2661558836698532, "step": 2540 }, { "epoch": 0.158875, "grad_norm": 2.984375, "grad_norm_var": 0.008968098958333334, "learning_rate": 0.0001, "loss": 8.5442, "loss/crossentropy": 2.5190815925598145, "loss/hidden": 3.2578125, "loss/jsd": 0.0, "loss/logits": 0.2785092294216156, "step": 2542 }, { "epoch": 0.159, "grad_norm": 3.5, "grad_norm_var": 0.0190093994140625, "learning_rate": 0.0001, "loss": 8.5158, "loss/crossentropy": 2.410357117652893, "loss/hidden": 3.234375, "loss/jsd": 0.0, "loss/logits": 0.2825370132923126, "step": 2544 }, { "epoch": 0.159125, "grad_norm": 3.03125, "grad_norm_var": 0.017853800455729166, "learning_rate": 0.0001, "loss": 8.0455, "loss/crossentropy": 2.469596028327942, "loss/hidden": 3.3125, "loss/jsd": 0.0, "loss/logits": 0.2713569104671478, "step": 2546 }, { "epoch": 0.15925, "grad_norm": 3.15625, "grad_norm_var": 0.0189605712890625, "learning_rate": 0.0001, "loss": 8.1763, "loss/crossentropy": 1.902215301990509, "loss/hidden": 3.296875, "loss/jsd": 0.0, "loss/logits": 0.23949339985847473, "step": 2548 }, { "epoch": 0.159375, "grad_norm": 3.0625, "grad_norm_var": 0.018648274739583335, "learning_rate": 0.0001, "loss": 8.2739, "loss/crossentropy": 2.2176631689071655, "loss/hidden": 3.2109375, "loss/jsd": 0.0, "loss/logits": 0.2655271142721176, "step": 2550 }, { "epoch": 0.1595, "grad_norm": 3.109375, "grad_norm_var": 0.017853800455729166, "learning_rate": 0.0001, "loss": 8.253, "loss/crossentropy": 2.4577187299728394, "loss/hidden": 3.234375, "loss/jsd": 0.0, "loss/logits": 0.25844304263591766, "step": 2552 }, { "epoch": 0.159625, "grad_norm": 3.4375, "grad_norm_var": 0.0239898681640625, "learning_rate": 0.0001, "loss": 8.3834, "loss/crossentropy": 2.0720095038414, "loss/hidden": 3.1953125, "loss/jsd": 0.0, "loss/logits": 0.25736086815595627, "step": 2554 }, { "epoch": 0.15975, "grad_norm": 3.078125, "grad_norm_var": 0.024149576822916668, "learning_rate": 0.0001, "loss": 8.3121, "loss/crossentropy": 2.38494074344635, "loss/hidden": 3.296875, "loss/jsd": 0.0, "loss/logits": 0.3003828078508377, "step": 2556 }, { "epoch": 0.159875, "grad_norm": 2.921875, "grad_norm_var": 0.0262603759765625, "learning_rate": 0.0001, "loss": 8.1596, "loss/crossentropy": 2.4485820531845093, "loss/hidden": 3.2421875, "loss/jsd": 0.0, "loss/logits": 0.26067347824573517, "step": 2558 }, { "epoch": 0.16, "grad_norm": 3.484375, "grad_norm_var": 0.024283854166666667, "learning_rate": 0.0001, "loss": 8.4399, "loss/crossentropy": 2.1876609325408936, "loss/hidden": 3.2109375, "loss/jsd": 0.0, "loss/logits": 0.25350525975227356, "step": 2560 }, { "epoch": 0.160125, "grad_norm": 5.0625, "grad_norm_var": 0.25761311848958335, "learning_rate": 0.0001, "loss": 8.381, "loss/crossentropy": 2.265621542930603, "loss/hidden": 3.3984375, "loss/jsd": 0.0, "loss/logits": 0.28078918159008026, "step": 2562 }, { "epoch": 0.16025, "grad_norm": 3.5625, "grad_norm_var": 0.30728759765625, "learning_rate": 0.0001, "loss": 8.5677, "loss/crossentropy": 2.5903844833374023, "loss/hidden": 3.3203125, "loss/jsd": 0.0, "loss/logits": 0.2908772826194763, "step": 2564 }, { "epoch": 0.160375, "grad_norm": 3.09375, "grad_norm_var": 0.3009104410807292, "learning_rate": 0.0001, "loss": 8.3239, "loss/crossentropy": 2.2390648126602173, "loss/hidden": 3.3203125, "loss/jsd": 0.0, "loss/logits": 0.2874547988176346, "step": 2566 }, { "epoch": 0.1605, "grad_norm": 3.203125, "grad_norm_var": 0.3062408447265625, "learning_rate": 0.0001, "loss": 8.1255, "loss/crossentropy": 2.012996554374695, "loss/hidden": 3.234375, "loss/jsd": 0.0, "loss/logits": 0.25241582095623016, "step": 2568 }, { "epoch": 0.160625, "grad_norm": 3.15625, "grad_norm_var": 0.3130523681640625, "learning_rate": 0.0001, "loss": 8.3869, "loss/crossentropy": 2.2517330646514893, "loss/hidden": 3.2890625, "loss/jsd": 0.0, "loss/logits": 0.27007003128528595, "step": 2570 }, { "epoch": 0.16075, "grad_norm": 3.09375, "grad_norm_var": 0.3095448811848958, "learning_rate": 0.0001, "loss": 8.5403, "loss/crossentropy": 2.271014094352722, "loss/hidden": 3.2890625, "loss/jsd": 0.0, "loss/logits": 0.2996182441711426, "step": 2572 }, { "epoch": 0.160875, "grad_norm": 3.0, "grad_norm_var": 0.3062164306640625, "learning_rate": 0.0001, "loss": 8.1903, "loss/crossentropy": 1.994364619255066, "loss/hidden": 3.265625, "loss/jsd": 0.0, "loss/logits": 0.26399780064821243, "step": 2574 }, { "epoch": 0.161, "grad_norm": 2.9375, "grad_norm_var": 0.3115386962890625, "learning_rate": 0.0001, "loss": 8.2641, "loss/crossentropy": 2.1744818687438965, "loss/hidden": 3.3828125, "loss/jsd": 0.0, "loss/logits": 0.3009081333875656, "step": 2576 }, { "epoch": 0.161125, "grad_norm": 3.25, "grad_norm_var": 0.09748433430989584, "learning_rate": 0.0001, "loss": 8.265, "loss/crossentropy": 2.465830087661743, "loss/hidden": 3.328125, "loss/jsd": 0.0, "loss/logits": 0.2836051285266876, "step": 2578 }, { "epoch": 0.16125, "grad_norm": 3.34375, "grad_norm_var": 0.016535441080729168, "learning_rate": 0.0001, "loss": 8.5721, "loss/crossentropy": 2.098099946975708, "loss/hidden": 3.2890625, "loss/jsd": 0.0, "loss/logits": 0.2927285134792328, "step": 2580 }, { "epoch": 0.161375, "grad_norm": 3.0, "grad_norm_var": 0.01763916015625, "learning_rate": 0.0001, "loss": 8.3492, "loss/crossentropy": 2.6066181659698486, "loss/hidden": 3.2734375, "loss/jsd": 0.0, "loss/logits": 0.3026330918073654, "step": 2582 }, { "epoch": 0.1615, "grad_norm": 2.984375, "grad_norm_var": 0.016927083333333332, "learning_rate": 0.0001, "loss": 8.2841, "loss/crossentropy": 2.3234163522720337, "loss/hidden": 3.2578125, "loss/jsd": 0.0, "loss/logits": 0.26173534989356995, "step": 2584 }, { "epoch": 0.161625, "grad_norm": 3.59375, "grad_norm_var": 0.030692545572916667, "learning_rate": 0.0001, "loss": 8.3342, "loss/crossentropy": 2.3151192665100098, "loss/hidden": 3.203125, "loss/jsd": 0.0, "loss/logits": 0.2646416872739792, "step": 2586 }, { "epoch": 0.16175, "grad_norm": 3.0, "grad_norm_var": 0.03170572916666667, "learning_rate": 0.0001, "loss": 8.1608, "loss/crossentropy": 2.3148727416992188, "loss/hidden": 3.1796875, "loss/jsd": 0.0, "loss/logits": 0.26169875264167786, "step": 2588 }, { "epoch": 0.161875, "grad_norm": 3.09375, "grad_norm_var": 0.0297271728515625, "learning_rate": 0.0001, "loss": 8.2822, "loss/crossentropy": 2.418124556541443, "loss/hidden": 3.2734375, "loss/jsd": 0.0, "loss/logits": 0.2865563780069351, "step": 2590 }, { "epoch": 0.162, "grad_norm": 2.90625, "grad_norm_var": 0.035090128580729164, "learning_rate": 0.0001, "loss": 8.2814, "loss/crossentropy": 2.2745689153671265, "loss/hidden": 3.2734375, "loss/jsd": 0.0, "loss/logits": 0.27167628705501556, "step": 2592 }, { "epoch": 0.162125, "grad_norm": 3.296875, "grad_norm_var": 0.03736572265625, "learning_rate": 0.0001, "loss": 8.1517, "loss/crossentropy": 2.1091307401657104, "loss/hidden": 3.203125, "loss/jsd": 0.0, "loss/logits": 0.26850171387195587, "step": 2594 }, { "epoch": 0.16225, "grad_norm": 3.015625, "grad_norm_var": 0.03427327473958333, "learning_rate": 0.0001, "loss": 8.2353, "loss/crossentropy": 2.3798282146453857, "loss/hidden": 3.2734375, "loss/jsd": 0.0, "loss/logits": 0.2726765275001526, "step": 2596 }, { "epoch": 0.162375, "grad_norm": 2.890625, "grad_norm_var": 0.036473592122395836, "learning_rate": 0.0001, "loss": 8.1632, "loss/crossentropy": 2.2773178815841675, "loss/hidden": 3.171875, "loss/jsd": 0.0, "loss/logits": 0.25379926711320877, "step": 2598 }, { "epoch": 0.1625, "grad_norm": 3.265625, "grad_norm_var": 0.041624959309895834, "learning_rate": 0.0001, "loss": 8.6347, "loss/crossentropy": 2.6614561080932617, "loss/hidden": 3.3203125, "loss/jsd": 0.0, "loss/logits": 0.28401175141334534, "step": 2600 }, { "epoch": 0.162625, "grad_norm": 3.0, "grad_norm_var": 0.024495442708333332, "learning_rate": 0.0001, "loss": 8.1829, "loss/crossentropy": 2.384236216545105, "loss/hidden": 3.2421875, "loss/jsd": 0.0, "loss/logits": 0.2784354239702225, "step": 2602 }, { "epoch": 0.16275, "grad_norm": 3.0625, "grad_norm_var": 0.024714152018229168, "learning_rate": 0.0001, "loss": 8.2394, "loss/crossentropy": 2.282965302467346, "loss/hidden": 3.3203125, "loss/jsd": 0.0, "loss/logits": 0.2925720512866974, "step": 2604 }, { "epoch": 0.162875, "grad_norm": 2.859375, "grad_norm_var": 0.026167805989583334, "learning_rate": 0.0001, "loss": 8.2496, "loss/crossentropy": 2.2551791667938232, "loss/hidden": 3.25, "loss/jsd": 0.0, "loss/logits": 0.25895287841558456, "step": 2606 }, { "epoch": 0.163, "grad_norm": 3.046875, "grad_norm_var": 0.022004191080729166, "learning_rate": 0.0001, "loss": 8.3511, "loss/crossentropy": 2.298862099647522, "loss/hidden": 3.1875, "loss/jsd": 0.0, "loss/logits": 0.25684790313243866, "step": 2608 }, { "epoch": 0.163125, "grad_norm": 3.25, "grad_norm_var": 0.02080078125, "learning_rate": 0.0001, "loss": 8.3087, "loss/crossentropy": 2.253198981285095, "loss/hidden": 3.21875, "loss/jsd": 0.0, "loss/logits": 0.2626389414072037, "step": 2610 }, { "epoch": 0.16325, "grad_norm": 3.015625, "grad_norm_var": 0.0210601806640625, "learning_rate": 0.0001, "loss": 8.2759, "loss/crossentropy": 2.5446906089782715, "loss/hidden": 3.2421875, "loss/jsd": 0.0, "loss/logits": 0.2750287652015686, "step": 2612 }, { "epoch": 0.163375, "grad_norm": 3.203125, "grad_norm_var": 0.019319661458333335, "learning_rate": 0.0001, "loss": 8.1388, "loss/crossentropy": 2.444550633430481, "loss/hidden": 3.265625, "loss/jsd": 0.0, "loss/logits": 0.2657383382320404, "step": 2614 }, { "epoch": 0.1635, "grad_norm": 3.15625, "grad_norm_var": 0.013505045572916667, "learning_rate": 0.0001, "loss": 8.3399, "loss/crossentropy": 2.2695289850234985, "loss/hidden": 3.2578125, "loss/jsd": 0.0, "loss/logits": 0.28452712297439575, "step": 2616 }, { "epoch": 0.163625, "grad_norm": 2.890625, "grad_norm_var": 0.015477498372395834, "learning_rate": 0.0001, "loss": 8.1938, "loss/crossentropy": 2.3448396921157837, "loss/hidden": 3.203125, "loss/jsd": 0.0, "loss/logits": 0.26763126254081726, "step": 2618 }, { "epoch": 0.16375, "grad_norm": 2.9375, "grad_norm_var": 0.01607666015625, "learning_rate": 0.0001, "loss": 8.2201, "loss/crossentropy": 2.2416622638702393, "loss/hidden": 3.2421875, "loss/jsd": 0.0, "loss/logits": 0.24487657845020294, "step": 2620 }, { "epoch": 0.163875, "grad_norm": 3.328125, "grad_norm_var": 0.018114217122395835, "learning_rate": 0.0001, "loss": 8.2554, "loss/crossentropy": 2.0370543003082275, "loss/hidden": 3.265625, "loss/jsd": 0.0, "loss/logits": 0.30543872714042664, "step": 2622 }, { "epoch": 0.164, "grad_norm": 2.8125, "grad_norm_var": 0.0245025634765625, "learning_rate": 0.0001, "loss": 8.0796, "loss/crossentropy": 2.2707594633102417, "loss/hidden": 3.234375, "loss/jsd": 0.0, "loss/logits": 0.26010069251060486, "step": 2624 }, { "epoch": 0.164125, "grad_norm": 3.171875, "grad_norm_var": 0.026399739583333335, "learning_rate": 0.0001, "loss": 8.2565, "loss/crossentropy": 2.085159659385681, "loss/hidden": 3.15625, "loss/jsd": 0.0, "loss/logits": 0.2566560357809067, "step": 2626 }, { "epoch": 0.16425, "grad_norm": 3.171875, "grad_norm_var": 0.025389607747395834, "learning_rate": 0.0001, "loss": 8.163, "loss/crossentropy": 2.165037214756012, "loss/hidden": 3.15625, "loss/jsd": 0.0, "loss/logits": 0.255811408162117, "step": 2628 }, { "epoch": 0.164375, "grad_norm": 2.9375, "grad_norm_var": 0.023954264322916665, "learning_rate": 0.0001, "loss": 8.322, "loss/crossentropy": 2.383608818054199, "loss/hidden": 3.3359375, "loss/jsd": 0.0, "loss/logits": 0.29559099674224854, "step": 2630 }, { "epoch": 0.1645, "grad_norm": 3.140625, "grad_norm_var": 0.023688761393229167, "learning_rate": 0.0001, "loss": 8.4369, "loss/crossentropy": 2.5001370906829834, "loss/hidden": 3.2421875, "loss/jsd": 0.0, "loss/logits": 0.2756764143705368, "step": 2632 }, { "epoch": 0.164625, "grad_norm": 3.0625, "grad_norm_var": 0.022684733072916668, "learning_rate": 0.0001, "loss": 8.2474, "loss/crossentropy": 2.153060555458069, "loss/hidden": 3.234375, "loss/jsd": 0.0, "loss/logits": 0.2610151022672653, "step": 2634 }, { "epoch": 0.16475, "grad_norm": 3.234375, "grad_norm_var": 0.02958984375, "learning_rate": 0.0001, "loss": 8.3281, "loss/crossentropy": 2.4711453914642334, "loss/hidden": 3.1953125, "loss/jsd": 0.0, "loss/logits": 0.2531973719596863, "step": 2636 }, { "epoch": 0.164875, "grad_norm": 3.171875, "grad_norm_var": 0.028837076822916665, "learning_rate": 0.0001, "loss": 8.4733, "loss/crossentropy": 2.4670186042785645, "loss/hidden": 3.375, "loss/jsd": 0.0, "loss/logits": 0.30856695771217346, "step": 2638 }, { "epoch": 0.165, "grad_norm": 3.171875, "grad_norm_var": 0.022945149739583334, "learning_rate": 0.0001, "loss": 8.2444, "loss/crossentropy": 2.373382568359375, "loss/hidden": 3.21875, "loss/jsd": 0.0, "loss/logits": 0.2556057423353195, "step": 2640 }, { "epoch": 0.165125, "grad_norm": 3.25, "grad_norm_var": 0.016502888997395833, "learning_rate": 0.0001, "loss": 8.2022, "loss/crossentropy": 2.3559107780456543, "loss/hidden": 3.1953125, "loss/jsd": 0.0, "loss/logits": 0.2673548609018326, "step": 2642 }, { "epoch": 0.16525, "grad_norm": 2.953125, "grad_norm_var": 0.01890869140625, "learning_rate": 0.0001, "loss": 8.149, "loss/crossentropy": 2.312580704689026, "loss/hidden": 3.2109375, "loss/jsd": 0.0, "loss/logits": 0.2649365961551666, "step": 2644 }, { "epoch": 0.165375, "grad_norm": 3.34375, "grad_norm_var": 0.018871053059895834, "learning_rate": 0.0001, "loss": 8.1288, "loss/crossentropy": 2.0767895579338074, "loss/hidden": 3.3125, "loss/jsd": 0.0, "loss/logits": 0.2341424822807312, "step": 2646 }, { "epoch": 0.1655, "grad_norm": 2.984375, "grad_norm_var": 0.021610514322916666, "learning_rate": 0.0001, "loss": 8.0931, "loss/crossentropy": 2.3518717288970947, "loss/hidden": 3.2265625, "loss/jsd": 0.0, "loss/logits": 0.28567561507225037, "step": 2648 }, { "epoch": 0.165625, "grad_norm": 3.21875, "grad_norm_var": 0.021076456705729166, "learning_rate": 0.0001, "loss": 8.4698, "loss/crossentropy": 2.3555731773376465, "loss/hidden": 3.265625, "loss/jsd": 0.0, "loss/logits": 0.2730225473642349, "step": 2650 }, { "epoch": 0.16575, "grad_norm": 3.40625, "grad_norm_var": 0.025406901041666666, "learning_rate": 0.0001, "loss": 8.3373, "loss/crossentropy": 2.0926910042762756, "loss/hidden": 3.203125, "loss/jsd": 0.0, "loss/logits": 0.2553661912679672, "step": 2652 }, { "epoch": 0.165875, "grad_norm": 2.859375, "grad_norm_var": 0.027766927083333334, "learning_rate": 0.0001, "loss": 8.2932, "loss/crossentropy": 2.325496196746826, "loss/hidden": 3.28125, "loss/jsd": 0.0, "loss/logits": 0.27998365461826324, "step": 2654 }, { "epoch": 0.166, "grad_norm": 2.9375, "grad_norm_var": 0.029524739583333334, "learning_rate": 0.0001, "loss": 8.2462, "loss/crossentropy": 2.3335851430892944, "loss/hidden": 3.234375, "loss/jsd": 0.0, "loss/logits": 0.272167906165123, "step": 2656 }, { "epoch": 0.166125, "grad_norm": 3.0625, "grad_norm_var": 0.028206380208333333, "learning_rate": 0.0001, "loss": 8.1733, "loss/crossentropy": 2.1524252891540527, "loss/hidden": 3.1953125, "loss/jsd": 0.0, "loss/logits": 0.27060362696647644, "step": 2658 }, { "epoch": 0.16625, "grad_norm": 3.265625, "grad_norm_var": 0.0273101806640625, "learning_rate": 0.0001, "loss": 8.3935, "loss/crossentropy": 2.3421573638916016, "loss/hidden": 3.21875, "loss/jsd": 0.0, "loss/logits": 0.2588433101773262, "step": 2660 }, { "epoch": 0.166375, "grad_norm": 2.921875, "grad_norm_var": 0.027067057291666665, "learning_rate": 0.0001, "loss": 8.1186, "loss/crossentropy": 2.073794722557068, "loss/hidden": 3.1875, "loss/jsd": 0.0, "loss/logits": 0.24907249212265015, "step": 2662 }, { "epoch": 0.1665, "grad_norm": 3.140625, "grad_norm_var": 0.024605305989583333, "learning_rate": 0.0001, "loss": 8.2707, "loss/crossentropy": 2.2278562784194946, "loss/hidden": 3.203125, "loss/jsd": 0.0, "loss/logits": 0.2469905987381935, "step": 2664 }, { "epoch": 0.166625, "grad_norm": 2.984375, "grad_norm_var": 0.025472005208333332, "learning_rate": 0.0001, "loss": 8.0191, "loss/crossentropy": 2.48944628238678, "loss/hidden": 3.234375, "loss/jsd": 0.0, "loss/logits": 0.26759523153305054, "step": 2666 }, { "epoch": 0.16675, "grad_norm": 3.0, "grad_norm_var": 0.0146881103515625, "learning_rate": 0.0001, "loss": 8.1883, "loss/crossentropy": 2.166255533695221, "loss/hidden": 3.1953125, "loss/jsd": 0.0, "loss/logits": 0.2581564337015152, "step": 2668 }, { "epoch": 0.166875, "grad_norm": 3.125, "grad_norm_var": 0.011751302083333333, "learning_rate": 0.0001, "loss": 8.2138, "loss/crossentropy": 2.3537596464157104, "loss/hidden": 3.2578125, "loss/jsd": 0.0, "loss/logits": 0.28053438663482666, "step": 2670 }, { "epoch": 0.167, "grad_norm": 2.984375, "grad_norm_var": 0.0102935791015625, "learning_rate": 0.0001, "loss": 8.1638, "loss/crossentropy": 2.3822021484375, "loss/hidden": 3.171875, "loss/jsd": 0.0, "loss/logits": 0.25109483301639557, "step": 2672 }, { "epoch": 0.167125, "grad_norm": 3.328125, "grad_norm_var": 0.014435831705729167, "learning_rate": 0.0001, "loss": 8.3125, "loss/crossentropy": 2.210346817970276, "loss/hidden": 3.2265625, "loss/jsd": 0.0, "loss/logits": 0.2710355073213577, "step": 2674 }, { "epoch": 0.16725, "grad_norm": 3.203125, "grad_norm_var": 0.0143951416015625, "learning_rate": 0.0001, "loss": 8.0399, "loss/crossentropy": 2.1359152793884277, "loss/hidden": 3.2109375, "loss/jsd": 0.0, "loss/logits": 0.2473316341638565, "step": 2676 }, { "epoch": 0.167375, "grad_norm": 2.921875, "grad_norm_var": 0.0133209228515625, "learning_rate": 0.0001, "loss": 8.2351, "loss/crossentropy": 2.4049233198165894, "loss/hidden": 3.203125, "loss/jsd": 0.0, "loss/logits": 0.2830816060304642, "step": 2678 }, { "epoch": 0.1675, "grad_norm": 2.96875, "grad_norm_var": 0.013166300455729167, "learning_rate": 0.0001, "loss": 8.1941, "loss/crossentropy": 2.3928329944610596, "loss/hidden": 3.234375, "loss/jsd": 0.0, "loss/logits": 0.266268789768219, "step": 2680 }, { "epoch": 0.167625, "grad_norm": 3.421875, "grad_norm_var": 0.025748697916666667, "learning_rate": 0.0001, "loss": 8.0556, "loss/crossentropy": 2.024741470813751, "loss/hidden": 3.234375, "loss/jsd": 0.0, "loss/logits": 0.26838643848896027, "step": 2682 }, { "epoch": 0.16775, "grad_norm": 2.96875, "grad_norm_var": 0.049958292643229166, "learning_rate": 0.0001, "loss": 8.3848, "loss/crossentropy": 2.528537154197693, "loss/hidden": 3.2421875, "loss/jsd": 0.0, "loss/logits": 0.2850952595472336, "step": 2684 }, { "epoch": 0.167875, "grad_norm": 3.046875, "grad_norm_var": 0.04983723958333333, "learning_rate": 0.0001, "loss": 8.3009, "loss/crossentropy": 2.2957894802093506, "loss/hidden": 3.234375, "loss/jsd": 0.0, "loss/logits": 0.26776544749736786, "step": 2686 }, { "epoch": 0.168, "grad_norm": 3.21875, "grad_norm_var": 0.05024312337239583, "learning_rate": 0.0001, "loss": 8.3334, "loss/crossentropy": 2.272274613380432, "loss/hidden": 3.1796875, "loss/jsd": 0.0, "loss/logits": 0.2532171159982681, "step": 2688 }, { "epoch": 0.168125, "grad_norm": 2.96875, "grad_norm_var": 0.048981730143229166, "learning_rate": 0.0001, "loss": 8.337, "loss/crossentropy": 2.520447254180908, "loss/hidden": 3.28125, "loss/jsd": 0.0, "loss/logits": 0.2912859171628952, "step": 2690 }, { "epoch": 0.16825, "grad_norm": 2.953125, "grad_norm_var": 0.0484771728515625, "learning_rate": 0.0001, "loss": 8.0313, "loss/crossentropy": 2.2402195930480957, "loss/hidden": 3.3671875, "loss/jsd": 0.0, "loss/logits": 0.2597721219062805, "step": 2692 }, { "epoch": 0.168375, "grad_norm": 3.03125, "grad_norm_var": 0.045796712239583336, "learning_rate": 0.0001, "loss": 8.16, "loss/crossentropy": 2.263728380203247, "loss/hidden": 3.1875, "loss/jsd": 0.0, "loss/logits": 0.25052615255117416, "step": 2694 }, { "epoch": 0.1685, "grad_norm": 3.0, "grad_norm_var": 0.04702046712239583, "learning_rate": 0.0001, "loss": 8.2191, "loss/crossentropy": 2.2724695205688477, "loss/hidden": 3.203125, "loss/jsd": 0.0, "loss/logits": 0.27686507999897003, "step": 2696 }, { "epoch": 0.168625, "grad_norm": 3.15625, "grad_norm_var": 0.038557942708333334, "learning_rate": 0.0001, "loss": 8.292, "loss/crossentropy": 2.169528841972351, "loss/hidden": 3.25, "loss/jsd": 0.0, "loss/logits": 0.252603217959404, "step": 2698 }, { "epoch": 0.16875, "grad_norm": 4.5, "grad_norm_var": 0.14544169108072916, "learning_rate": 0.0001, "loss": 8.0871, "loss/crossentropy": 2.375905990600586, "loss/hidden": 3.3125, "loss/jsd": 0.0, "loss/logits": 0.2765290290117264, "step": 2700 }, { "epoch": 0.168875, "grad_norm": 3.015625, "grad_norm_var": 0.15084228515625, "learning_rate": 0.0001, "loss": 8.3753, "loss/crossentropy": 2.3945010900497437, "loss/hidden": 3.328125, "loss/jsd": 0.0, "loss/logits": 0.27985604107379913, "step": 2702 }, { "epoch": 0.169, "grad_norm": 3.140625, "grad_norm_var": 0.14921468098958332, "learning_rate": 0.0001, "loss": 8.0489, "loss/crossentropy": 2.1550697684288025, "loss/hidden": 3.3125, "loss/jsd": 0.0, "loss/logits": 0.2562442794442177, "step": 2704 }, { "epoch": 0.169125, "grad_norm": 2.875, "grad_norm_var": 0.15213216145833333, "learning_rate": 0.0001, "loss": 8.0596, "loss/crossentropy": 2.1525893211364746, "loss/hidden": 3.2109375, "loss/jsd": 0.0, "loss/logits": 0.2613821029663086, "step": 2706 }, { "epoch": 0.16925, "grad_norm": 2.953125, "grad_norm_var": 0.1527008056640625, "learning_rate": 0.0001, "loss": 8.2609, "loss/crossentropy": 2.194769859313965, "loss/hidden": 3.234375, "loss/jsd": 0.0, "loss/logits": 0.2507399097084999, "step": 2708 }, { "epoch": 0.169375, "grad_norm": 3.3125, "grad_norm_var": 0.1499420166015625, "learning_rate": 0.0001, "loss": 8.3803, "loss/crossentropy": 2.373349666595459, "loss/hidden": 3.2890625, "loss/jsd": 0.0, "loss/logits": 0.2778366059064865, "step": 2710 }, { "epoch": 0.1695, "grad_norm": 3.0625, "grad_norm_var": 0.14553629557291667, "learning_rate": 0.0001, "loss": 8.2339, "loss/crossentropy": 2.3904805183410645, "loss/hidden": 3.28125, "loss/jsd": 0.0, "loss/logits": 0.2910989373922348, "step": 2712 }, { "epoch": 0.169625, "grad_norm": 2.796875, "grad_norm_var": 0.15755106608072916, "learning_rate": 0.0001, "loss": 8.0392, "loss/crossentropy": 2.21162748336792, "loss/hidden": 3.203125, "loss/jsd": 0.0, "loss/logits": 0.24572212994098663, "step": 2714 }, { "epoch": 0.16975, "grad_norm": 3.09375, "grad_norm_var": 0.034375, "learning_rate": 0.0001, "loss": 8.1972, "loss/crossentropy": 2.2650893926620483, "loss/hidden": 3.25, "loss/jsd": 0.0, "loss/logits": 0.27223196625709534, "step": 2716 }, { "epoch": 0.169875, "grad_norm": 3.078125, "grad_norm_var": 0.030208333333333334, "learning_rate": 0.0001, "loss": 8.2521, "loss/crossentropy": 2.410581946372986, "loss/hidden": 3.2109375, "loss/jsd": 0.0, "loss/logits": 0.27209579944610596, "step": 2718 }, { "epoch": 0.17, "grad_norm": 3.0, "grad_norm_var": 0.03357747395833333, "learning_rate": 0.0001, "loss": 8.149, "loss/crossentropy": 2.4662805795669556, "loss/hidden": 3.1796875, "loss/jsd": 0.0, "loss/logits": 0.25760146975517273, "step": 2720 }, { "epoch": 0.170125, "grad_norm": 3.09375, "grad_norm_var": 0.030296834309895833, "learning_rate": 0.0001, "loss": 8.2252, "loss/crossentropy": 2.275286078453064, "loss/hidden": 3.1796875, "loss/jsd": 0.0, "loss/logits": 0.263226181268692, "step": 2722 }, { "epoch": 0.17025, "grad_norm": 3.0625, "grad_norm_var": 0.026862589518229167, "learning_rate": 0.0001, "loss": 8.2922, "loss/crossentropy": 2.4460601806640625, "loss/hidden": 3.234375, "loss/jsd": 0.0, "loss/logits": 0.28739283978939056, "step": 2724 }, { "epoch": 0.170375, "grad_norm": 3.0, "grad_norm_var": 0.0196197509765625, "learning_rate": 0.0001, "loss": 8.0972, "loss/crossentropy": 2.414928674697876, "loss/hidden": 3.2421875, "loss/jsd": 0.0, "loss/logits": 0.28981567919254303, "step": 2726 }, { "epoch": 0.1705, "grad_norm": 3.09375, "grad_norm_var": 0.0107086181640625, "learning_rate": 0.0001, "loss": 8.4286, "loss/crossentropy": 2.3859044313430786, "loss/hidden": 3.265625, "loss/jsd": 0.0, "loss/logits": 0.2801547795534134, "step": 2728 }, { "epoch": 0.170625, "grad_norm": 3.3125, "grad_norm_var": 0.010384114583333333, "learning_rate": 0.0001, "loss": 8.3648, "loss/crossentropy": 2.2060307264328003, "loss/hidden": 3.2109375, "loss/jsd": 0.0, "loss/logits": 0.26368267089128494, "step": 2730 }, { "epoch": 0.17075, "grad_norm": 3.359375, "grad_norm_var": 0.015478515625, "learning_rate": 0.0001, "loss": 8.5339, "loss/crossentropy": 2.4007495641708374, "loss/hidden": 3.203125, "loss/jsd": 0.0, "loss/logits": 0.27564837038517, "step": 2732 }, { "epoch": 0.170875, "grad_norm": 3.3125, "grad_norm_var": 0.01627197265625, "learning_rate": 0.0001, "loss": 8.176, "loss/crossentropy": 2.2184962034225464, "loss/hidden": 3.1796875, "loss/jsd": 0.0, "loss/logits": 0.2526983246207237, "step": 2734 }, { "epoch": 0.171, "grad_norm": 2.78125, "grad_norm_var": 0.019562784830729166, "learning_rate": 0.0001, "loss": 8.2518, "loss/crossentropy": 2.403747081756592, "loss/hidden": 3.2109375, "loss/jsd": 0.0, "loss/logits": 0.28357672691345215, "step": 2736 }, { "epoch": 0.171125, "grad_norm": 4.25, "grad_norm_var": 0.11033426920572917, "learning_rate": 0.0001, "loss": 8.3945, "loss/crossentropy": 2.2106932997703552, "loss/hidden": 3.3125, "loss/jsd": 0.0, "loss/logits": 0.27614726126194, "step": 2738 }, { "epoch": 0.17125, "grad_norm": 2.96875, "grad_norm_var": 0.11243082682291666, "learning_rate": 0.0001, "loss": 8.1762, "loss/crossentropy": 2.366937756538391, "loss/hidden": 3.3671875, "loss/jsd": 0.0, "loss/logits": 0.2758289873600006, "step": 2740 }, { "epoch": 0.171375, "grad_norm": 3.15625, "grad_norm_var": 0.10838216145833333, "learning_rate": 0.0001, "loss": 8.1801, "loss/crossentropy": 2.6269803047180176, "loss/hidden": 3.265625, "loss/jsd": 0.0, "loss/logits": 0.2783343344926834, "step": 2742 }, { "epoch": 0.1715, "grad_norm": 2.78125, "grad_norm_var": 0.12049153645833334, "learning_rate": 0.0001, "loss": 8.087, "loss/crossentropy": 2.2394570112228394, "loss/hidden": 3.2578125, "loss/jsd": 0.0, "loss/logits": 0.26301658153533936, "step": 2744 }, { "epoch": 0.171625, "grad_norm": 3.09375, "grad_norm_var": 0.11782938639322917, "learning_rate": 0.0001, "loss": 8.4104, "loss/crossentropy": 2.2185282707214355, "loss/hidden": 3.2265625, "loss/jsd": 0.0, "loss/logits": 0.28459644317626953, "step": 2746 }, { "epoch": 0.17175, "grad_norm": 3.5, "grad_norm_var": 0.12354227701822916, "learning_rate": 0.0001, "loss": 8.3685, "loss/crossentropy": 2.4350966215133667, "loss/hidden": 3.234375, "loss/jsd": 0.0, "loss/logits": 0.2675904631614685, "step": 2748 }, { "epoch": 0.171875, "grad_norm": 2.953125, "grad_norm_var": 0.12983296712239584, "learning_rate": 0.0001, "loss": 8.1103, "loss/crossentropy": 2.10713529586792, "loss/hidden": 3.2578125, "loss/jsd": 0.0, "loss/logits": 0.26895423233509064, "step": 2750 }, { "epoch": 0.172, "grad_norm": 2.953125, "grad_norm_var": 0.12000223795572916, "learning_rate": 0.0001, "loss": 8.2241, "loss/crossentropy": 2.1709011793136597, "loss/hidden": 3.2109375, "loss/jsd": 0.0, "loss/logits": 0.24941477179527283, "step": 2752 }, { "epoch": 0.172125, "grad_norm": 2.953125, "grad_norm_var": 0.039094034830729166, "learning_rate": 0.0001, "loss": 8.1094, "loss/crossentropy": 2.4355965852737427, "loss/hidden": 3.2265625, "loss/jsd": 0.0, "loss/logits": 0.27485549449920654, "step": 2754 }, { "epoch": 0.17225, "grad_norm": 3.25, "grad_norm_var": 0.03870035807291667, "learning_rate": 0.0001, "loss": 8.176, "loss/crossentropy": 2.2021514177322388, "loss/hidden": 3.1875, "loss/jsd": 0.0, "loss/logits": 0.24435579776763916, "step": 2756 }, { "epoch": 0.172375, "grad_norm": 2.9375, "grad_norm_var": 0.041193644205729164, "learning_rate": 0.0001, "loss": 8.0871, "loss/crossentropy": 2.2676401138305664, "loss/hidden": 3.1796875, "loss/jsd": 0.0, "loss/logits": 0.24549178779125214, "step": 2758 }, { "epoch": 0.1725, "grad_norm": 2.875, "grad_norm_var": 0.037333170572916664, "learning_rate": 0.0001, "loss": 8.3252, "loss/crossentropy": 2.266029477119446, "loss/hidden": 3.1796875, "loss/jsd": 0.0, "loss/logits": 0.26920171082019806, "step": 2760 }, { "epoch": 0.172625, "grad_norm": 3.0, "grad_norm_var": 0.04195556640625, "learning_rate": 0.0001, "loss": 8.3167, "loss/crossentropy": 2.2225186824798584, "loss/hidden": 3.1875, "loss/jsd": 0.0, "loss/logits": 0.2658109962940216, "step": 2762 }, { "epoch": 0.17275, "grad_norm": 3.015625, "grad_norm_var": 0.021712239583333334, "learning_rate": 0.0001, "loss": 8.3062, "loss/crossentropy": 2.5298062562942505, "loss/hidden": 3.3125, "loss/jsd": 0.0, "loss/logits": 0.2963249981403351, "step": 2764 }, { "epoch": 0.172875, "grad_norm": 3.109375, "grad_norm_var": 0.011735026041666667, "learning_rate": 0.0001, "loss": 8.1575, "loss/crossentropy": 2.093632698059082, "loss/hidden": 3.3046875, "loss/jsd": 0.0, "loss/logits": 0.2585790827870369, "step": 2766 }, { "epoch": 0.173, "grad_norm": 2.9375, "grad_norm_var": 0.010530598958333333, "learning_rate": 0.0001, "loss": 8.2698, "loss/crossentropy": 2.2221652269363403, "loss/hidden": 3.1953125, "loss/jsd": 0.0, "loss/logits": 0.2734009772539139, "step": 2768 }, { "epoch": 0.173125, "grad_norm": 2.984375, "grad_norm_var": 0.011909993489583333, "learning_rate": 0.0001, "loss": 8.0252, "loss/crossentropy": 2.214139223098755, "loss/hidden": 3.1875, "loss/jsd": 0.0, "loss/logits": 0.2616540938615799, "step": 2770 }, { "epoch": 0.17325, "grad_norm": 3.078125, "grad_norm_var": 0.012109375, "learning_rate": 0.0001, "loss": 8.18, "loss/crossentropy": 2.2513784170150757, "loss/hidden": 3.2265625, "loss/jsd": 0.0, "loss/logits": 0.25692541897296906, "step": 2772 }, { "epoch": 0.173375, "grad_norm": 2.8125, "grad_norm_var": 0.015184529622395833, "learning_rate": 0.0001, "loss": 8.1285, "loss/crossentropy": 2.4102020263671875, "loss/hidden": 3.21875, "loss/jsd": 0.0, "loss/logits": 0.2653050720691681, "step": 2774 }, { "epoch": 0.1735, "grad_norm": 3.140625, "grad_norm_var": 0.014876302083333333, "learning_rate": 0.0001, "loss": 8.1504, "loss/crossentropy": 2.2256804704666138, "loss/hidden": 3.21875, "loss/jsd": 0.0, "loss/logits": 0.26175589859485626, "step": 2776 }, { "epoch": 0.173625, "grad_norm": 2.921875, "grad_norm_var": 0.013297526041666667, "learning_rate": 0.0001, "loss": 8.2647, "loss/crossentropy": 2.562616467475891, "loss/hidden": 3.2890625, "loss/jsd": 0.0, "loss/logits": 0.28543052077293396, "step": 2778 }, { "epoch": 0.17375, "grad_norm": 2.859375, "grad_norm_var": 0.013963826497395833, "learning_rate": 0.0001, "loss": 8.0907, "loss/crossentropy": 2.1320207118988037, "loss/hidden": 3.2265625, "loss/jsd": 0.0, "loss/logits": 0.2546294927597046, "step": 2780 }, { "epoch": 0.173875, "grad_norm": 3.21875, "grad_norm_var": 0.020556640625, "learning_rate": 0.0001, "loss": 8.3402, "loss/crossentropy": 2.2618422508239746, "loss/hidden": 3.1796875, "loss/jsd": 0.0, "loss/logits": 0.27197229862213135, "step": 2782 }, { "epoch": 0.174, "grad_norm": 3.28125, "grad_norm_var": 0.024723307291666666, "learning_rate": 0.0001, "loss": 8.4636, "loss/crossentropy": 2.625874876976013, "loss/hidden": 3.265625, "loss/jsd": 0.0, "loss/logits": 0.27097639441490173, "step": 2784 }, { "epoch": 0.174125, "grad_norm": 2.90625, "grad_norm_var": 0.023558553059895834, "learning_rate": 0.0001, "loss": 8.1304, "loss/crossentropy": 2.170462965965271, "loss/hidden": 3.1953125, "loss/jsd": 0.0, "loss/logits": 0.28276196122169495, "step": 2786 }, { "epoch": 0.17425, "grad_norm": 3.078125, "grad_norm_var": 0.024290974934895834, "learning_rate": 0.0001, "loss": 8.0283, "loss/crossentropy": 2.2857601642608643, "loss/hidden": 3.203125, "loss/jsd": 0.0, "loss/logits": 0.2574136555194855, "step": 2788 }, { "epoch": 0.174375, "grad_norm": 3.09375, "grad_norm_var": 0.020978800455729165, "learning_rate": 0.0001, "loss": 8.1718, "loss/crossentropy": 2.1768821477890015, "loss/hidden": 3.1875, "loss/jsd": 0.0, "loss/logits": 0.26678016781806946, "step": 2790 }, { "epoch": 0.1745, "grad_norm": 3.140625, "grad_norm_var": 0.027762858072916667, "learning_rate": 0.0001, "loss": 8.3611, "loss/crossentropy": 2.337521195411682, "loss/hidden": 3.2109375, "loss/jsd": 0.0, "loss/logits": 0.26058705151081085, "step": 2792 }, { "epoch": 0.174625, "grad_norm": 3.0, "grad_norm_var": 0.0259185791015625, "learning_rate": 0.0001, "loss": 8.2887, "loss/crossentropy": 2.417194366455078, "loss/hidden": 3.21875, "loss/jsd": 0.0, "loss/logits": 0.2650887668132782, "step": 2794 }, { "epoch": 0.17475, "grad_norm": 3.125, "grad_norm_var": 0.021305338541666666, "learning_rate": 0.0001, "loss": 8.2086, "loss/crossentropy": 2.3959646224975586, "loss/hidden": 3.1796875, "loss/jsd": 0.0, "loss/logits": 0.25554417073726654, "step": 2796 }, { "epoch": 0.174875, "grad_norm": 3.140625, "grad_norm_var": 0.02144775390625, "learning_rate": 0.0001, "loss": 7.9649, "loss/crossentropy": 2.2579997777938843, "loss/hidden": 3.1484375, "loss/jsd": 0.0, "loss/logits": 0.24610213935375214, "step": 2798 }, { "epoch": 0.175, "grad_norm": 2.984375, "grad_norm_var": 0.01875, "learning_rate": 0.0001, "loss": 8.1861, "loss/crossentropy": 2.402603507041931, "loss/hidden": 3.28125, "loss/jsd": 0.0, "loss/logits": 0.27010577917099, "step": 2800 }, { "epoch": 0.175125, "grad_norm": 2.984375, "grad_norm_var": 0.018773396809895832, "learning_rate": 0.0001, "loss": 8.0684, "loss/crossentropy": 2.487064242362976, "loss/hidden": 3.2578125, "loss/jsd": 0.0, "loss/logits": 0.26188019663095474, "step": 2802 }, { "epoch": 0.17525, "grad_norm": 2.921875, "grad_norm_var": 0.016380818684895833, "learning_rate": 0.0001, "loss": 8.0874, "loss/crossentropy": 2.27841317653656, "loss/hidden": 3.2109375, "loss/jsd": 0.0, "loss/logits": 0.25010599195957184, "step": 2804 }, { "epoch": 0.175375, "grad_norm": 3.3125, "grad_norm_var": 0.027131144205729166, "learning_rate": 0.0001, "loss": 8.2989, "loss/crossentropy": 2.4073312282562256, "loss/hidden": 3.28125, "loss/jsd": 0.0, "loss/logits": 0.2679283916950226, "step": 2806 }, { "epoch": 0.1755, "grad_norm": 3.40625, "grad_norm_var": 0.0291015625, "learning_rate": 0.0001, "loss": 8.4547, "loss/crossentropy": 2.3957384824752808, "loss/hidden": 3.25, "loss/jsd": 0.0, "loss/logits": 0.27833716571331024, "step": 2808 }, { "epoch": 0.175625, "grad_norm": 3.03125, "grad_norm_var": 0.0291015625, "learning_rate": 0.0001, "loss": 8.2216, "loss/crossentropy": 2.447432279586792, "loss/hidden": 3.2421875, "loss/jsd": 0.0, "loss/logits": 0.283155158162117, "step": 2810 }, { "epoch": 0.17575, "grad_norm": 3.171875, "grad_norm_var": 0.03322652180989583, "learning_rate": 0.0001, "loss": 8.1123, "loss/crossentropy": 2.1782784461975098, "loss/hidden": 3.2265625, "loss/jsd": 0.0, "loss/logits": 0.24672221392393112, "step": 2812 }, { "epoch": 0.175875, "grad_norm": 3.03125, "grad_norm_var": 0.0337890625, "learning_rate": 0.0001, "loss": 8.2015, "loss/crossentropy": 2.194298505783081, "loss/hidden": 3.2578125, "loss/jsd": 0.0, "loss/logits": 0.25301510840654373, "step": 2814 }, { "epoch": 0.176, "grad_norm": 3.046875, "grad_norm_var": 0.035309855143229166, "learning_rate": 0.0001, "loss": 8.258, "loss/crossentropy": 2.2716476917266846, "loss/hidden": 3.234375, "loss/jsd": 0.0, "loss/logits": 0.28211984038352966, "step": 2816 }, { "epoch": 0.176125, "grad_norm": 3.078125, "grad_norm_var": 0.032548014322916666, "learning_rate": 0.0001, "loss": 8.1436, "loss/crossentropy": 2.4797651767730713, "loss/hidden": 3.2578125, "loss/jsd": 0.0, "loss/logits": 0.27206259965896606, "step": 2818 }, { "epoch": 0.17625, "grad_norm": 3.171875, "grad_norm_var": 0.033610026041666664, "learning_rate": 0.0001, "loss": 8.1555, "loss/crossentropy": 2.3100684881210327, "loss/hidden": 3.1875, "loss/jsd": 0.0, "loss/logits": 0.25597915798425674, "step": 2820 }, { "epoch": 0.176375, "grad_norm": 3.0, "grad_norm_var": 0.025153605143229167, "learning_rate": 0.0001, "loss": 8.19, "loss/crossentropy": 2.343941330909729, "loss/hidden": 3.1875, "loss/jsd": 0.0, "loss/logits": 0.26012492179870605, "step": 2822 }, { "epoch": 0.1765, "grad_norm": 2.953125, "grad_norm_var": 0.01533203125, "learning_rate": 0.0001, "loss": 8.1131, "loss/crossentropy": 2.218177556991577, "loss/hidden": 3.171875, "loss/jsd": 0.0, "loss/logits": 0.2628052681684494, "step": 2824 }, { "epoch": 0.176625, "grad_norm": 3.0, "grad_norm_var": 0.0157135009765625, "learning_rate": 0.0001, "loss": 8.3048, "loss/crossentropy": 2.3974251747131348, "loss/hidden": 3.2265625, "loss/jsd": 0.0, "loss/logits": 0.28162911534309387, "step": 2826 }, { "epoch": 0.17675, "grad_norm": 3.109375, "grad_norm_var": 0.012743123372395833, "learning_rate": 0.0001, "loss": 8.1691, "loss/crossentropy": 2.1724337339401245, "loss/hidden": 3.234375, "loss/jsd": 0.0, "loss/logits": 0.26139407604932785, "step": 2828 }, { "epoch": 0.176875, "grad_norm": 3.015625, "grad_norm_var": 0.007258097330729167, "learning_rate": 0.0001, "loss": 8.2618, "loss/crossentropy": 2.3914555311203003, "loss/hidden": 3.1953125, "loss/jsd": 0.0, "loss/logits": 0.28328216075897217, "step": 2830 }, { "epoch": 0.177, "grad_norm": 2.828125, "grad_norm_var": 0.00855712890625, "learning_rate": 0.0001, "loss": 7.9935, "loss/crossentropy": 2.235588788986206, "loss/hidden": 3.171875, "loss/jsd": 0.0, "loss/logits": 0.2594834715127945, "step": 2832 }, { "epoch": 0.177125, "grad_norm": 3.046875, "grad_norm_var": 0.0100006103515625, "learning_rate": 0.0001, "loss": 8.3621, "loss/crossentropy": 2.38160240650177, "loss/hidden": 3.2109375, "loss/jsd": 0.0, "loss/logits": 0.28699810802936554, "step": 2834 }, { "epoch": 0.17725, "grad_norm": 2.96875, "grad_norm_var": 0.0061187744140625, "learning_rate": 0.0001, "loss": 8.2658, "loss/crossentropy": 2.5288161039352417, "loss/hidden": 3.2265625, "loss/jsd": 0.0, "loss/logits": 0.2768819034099579, "step": 2836 }, { "epoch": 0.177375, "grad_norm": 2.859375, "grad_norm_var": 0.0096099853515625, "learning_rate": 0.0001, "loss": 7.8831, "loss/crossentropy": 2.262709617614746, "loss/hidden": 3.21875, "loss/jsd": 0.0, "loss/logits": 0.2727932184934616, "step": 2838 }, { "epoch": 0.1775, "grad_norm": 3.03125, "grad_norm_var": 0.010016886393229167, "learning_rate": 0.0001, "loss": 8.3451, "loss/crossentropy": 2.432405710220337, "loss/hidden": 3.2734375, "loss/jsd": 0.0, "loss/logits": 0.27632567286491394, "step": 2840 }, { "epoch": 0.177625, "grad_norm": 2.859375, "grad_norm_var": 0.011197916666666667, "learning_rate": 0.0001, "loss": 8.1573, "loss/crossentropy": 2.217153549194336, "loss/hidden": 3.1953125, "loss/jsd": 0.0, "loss/logits": 0.2539759650826454, "step": 2842 }, { "epoch": 0.17775, "grad_norm": 3.234375, "grad_norm_var": 0.05241597493489583, "learning_rate": 0.0001, "loss": 8.5061, "loss/crossentropy": 2.3605984449386597, "loss/hidden": 3.2578125, "loss/jsd": 0.0, "loss/logits": 0.2897241413593292, "step": 2844 }, { "epoch": 0.177875, "grad_norm": 3.125, "grad_norm_var": 0.0523590087890625, "learning_rate": 0.0001, "loss": 8.0759, "loss/crossentropy": 2.083876132965088, "loss/hidden": 3.1953125, "loss/jsd": 0.0, "loss/logits": 0.2714761793613434, "step": 2846 }, { "epoch": 0.178, "grad_norm": 3.375, "grad_norm_var": 0.055680338541666666, "learning_rate": 0.0001, "loss": 8.4974, "loss/crossentropy": 2.539989471435547, "loss/hidden": 3.296875, "loss/jsd": 0.0, "loss/logits": 0.28579047322273254, "step": 2848 }, { "epoch": 0.178125, "grad_norm": 3.265625, "grad_norm_var": 0.05750325520833333, "learning_rate": 0.0001, "loss": 8.2717, "loss/crossentropy": 2.558953881263733, "loss/hidden": 3.25, "loss/jsd": 0.0, "loss/logits": 0.2836534380912781, "step": 2850 }, { "epoch": 0.17825, "grad_norm": 3.0, "grad_norm_var": 0.05771484375, "learning_rate": 0.0001, "loss": 8.2571, "loss/crossentropy": 2.3250139951705933, "loss/hidden": 3.15625, "loss/jsd": 0.0, "loss/logits": 0.25975073873996735, "step": 2852 }, { "epoch": 0.178375, "grad_norm": 3.015625, "grad_norm_var": 0.050032552083333334, "learning_rate": 0.0001, "loss": 8.24, "loss/crossentropy": 2.257322072982788, "loss/hidden": 3.1875, "loss/jsd": 0.0, "loss/logits": 0.26378054916858673, "step": 2854 }, { "epoch": 0.1785, "grad_norm": 3.015625, "grad_norm_var": 0.04654541015625, "learning_rate": 0.0001, "loss": 8.5173, "loss/crossentropy": 2.4077916145324707, "loss/hidden": 3.25, "loss/jsd": 0.0, "loss/logits": 0.30688565969467163, "step": 2856 }, { "epoch": 0.178625, "grad_norm": 3.25, "grad_norm_var": 0.0427642822265625, "learning_rate": 0.0001, "loss": 8.2647, "loss/crossentropy": 2.3220880031585693, "loss/hidden": 3.21875, "loss/jsd": 0.0, "loss/logits": 0.2572537362575531, "step": 2858 }, { "epoch": 0.17875, "grad_norm": 2.96875, "grad_norm_var": 0.019269816080729165, "learning_rate": 0.0001, "loss": 8.1931, "loss/crossentropy": 2.392123222351074, "loss/hidden": 3.265625, "loss/jsd": 0.0, "loss/logits": 0.2863081991672516, "step": 2860 }, { "epoch": 0.178875, "grad_norm": 3.03125, "grad_norm_var": 0.021647135416666668, "learning_rate": 0.0001, "loss": 8.284, "loss/crossentropy": 2.2904245853424072, "loss/hidden": 3.25, "loss/jsd": 0.0, "loss/logits": 0.2629016935825348, "step": 2862 }, { "epoch": 0.179, "grad_norm": 2.9375, "grad_norm_var": 0.017724609375, "learning_rate": 0.0001, "loss": 7.9001, "loss/crossentropy": 2.10800838470459, "loss/hidden": 3.1640625, "loss/jsd": 0.0, "loss/logits": 0.25615420937538147, "step": 2864 }, { "epoch": 0.179125, "grad_norm": 3.015625, "grad_norm_var": 0.015380859375, "learning_rate": 0.0001, "loss": 8.1166, "loss/crossentropy": 2.113399863243103, "loss/hidden": 3.171875, "loss/jsd": 0.0, "loss/logits": 0.2571730315685272, "step": 2866 }, { "epoch": 0.17925, "grad_norm": 3.078125, "grad_norm_var": 0.01324462890625, "learning_rate": 0.0001, "loss": 8.3192, "loss/crossentropy": 2.5056850910186768, "loss/hidden": 3.1875, "loss/jsd": 0.0, "loss/logits": 0.27596791088581085, "step": 2868 }, { "epoch": 0.179375, "grad_norm": 2.96875, "grad_norm_var": 0.0142974853515625, "learning_rate": 0.0001, "loss": 8.0736, "loss/crossentropy": 2.1984351873397827, "loss/hidden": 3.171875, "loss/jsd": 0.0, "loss/logits": 0.22599153965711594, "step": 2870 }, { "epoch": 0.1795, "grad_norm": 2.921875, "grad_norm_var": 0.013288370768229167, "learning_rate": 0.0001, "loss": 8.1654, "loss/crossentropy": 2.398763060569763, "loss/hidden": 3.2265625, "loss/jsd": 0.0, "loss/logits": 0.2709758132696152, "step": 2872 }, { "epoch": 0.179625, "grad_norm": 3.578125, "grad_norm_var": 0.031859334309895834, "learning_rate": 0.0001, "loss": 8.246, "loss/crossentropy": 2.279433012008667, "loss/hidden": 3.1875, "loss/jsd": 0.0, "loss/logits": 0.26721224188804626, "step": 2874 }, { "epoch": 0.17975, "grad_norm": 3.578125, "grad_norm_var": 0.09832356770833334, "learning_rate": 0.0001, "loss": 8.6018, "loss/crossentropy": 2.429903745651245, "loss/hidden": 3.2421875, "loss/jsd": 0.0, "loss/logits": 0.3210333585739136, "step": 2876 }, { "epoch": 0.179875, "grad_norm": 2.90625, "grad_norm_var": 0.10014546712239583, "learning_rate": 0.0001, "loss": 8.3066, "loss/crossentropy": 2.235803484916687, "loss/hidden": 3.1796875, "loss/jsd": 0.0, "loss/logits": 0.265362948179245, "step": 2878 }, { "epoch": 0.18, "grad_norm": 2.9375, "grad_norm_var": 0.10035400390625, "learning_rate": 0.0001, "loss": 8.2791, "loss/crossentropy": 2.2972434759140015, "loss/hidden": 3.1875, "loss/jsd": 0.0, "loss/logits": 0.25388309359550476, "step": 2880 }, { "epoch": 0.180125, "grad_norm": 3.25, "grad_norm_var": 0.10048828125, "learning_rate": 0.0001, "loss": 8.3889, "loss/crossentropy": 2.3344578742980957, "loss/hidden": 3.2421875, "loss/jsd": 0.0, "loss/logits": 0.25023628771305084, "step": 2882 }, { "epoch": 0.18025, "grad_norm": 3.046875, "grad_norm_var": 0.10738525390625, "learning_rate": 0.0001, "loss": 8.2306, "loss/crossentropy": 2.3195676803588867, "loss/hidden": 3.1953125, "loss/jsd": 0.0, "loss/logits": 0.27837128937244415, "step": 2884 }, { "epoch": 0.180375, "grad_norm": 2.921875, "grad_norm_var": 0.1024810791015625, "learning_rate": 0.0001, "loss": 8.0015, "loss/crossentropy": 2.394813656806946, "loss/hidden": 3.1953125, "loss/jsd": 0.0, "loss/logits": 0.25014276802539825, "step": 2886 }, { "epoch": 0.1805, "grad_norm": 2.90625, "grad_norm_var": 0.11116129557291667, "learning_rate": 0.0001, "loss": 7.8888, "loss/crossentropy": 2.122212529182434, "loss/hidden": 3.1640625, "loss/jsd": 0.0, "loss/logits": 0.2572604715824127, "step": 2888 }, { "epoch": 0.180625, "grad_norm": 3.25, "grad_norm_var": 0.09462890625, "learning_rate": 0.0001, "loss": 8.2487, "loss/crossentropy": 2.3987185955047607, "loss/hidden": 3.2109375, "loss/jsd": 0.0, "loss/logits": 0.28120650351047516, "step": 2890 }, { "epoch": 0.18075, "grad_norm": 3.109375, "grad_norm_var": 0.03023681640625, "learning_rate": 0.0001, "loss": 8.2168, "loss/crossentropy": 2.4273810386657715, "loss/hidden": 3.203125, "loss/jsd": 0.0, "loss/logits": 0.2616555094718933, "step": 2892 }, { "epoch": 0.180875, "grad_norm": 2.8125, "grad_norm_var": 0.035065714518229166, "learning_rate": 0.0001, "loss": 8.1309, "loss/crossentropy": 2.4763081073760986, "loss/hidden": 3.203125, "loss/jsd": 0.0, "loss/logits": 0.26364801824092865, "step": 2894 }, { "epoch": 0.181, "grad_norm": 3.015625, "grad_norm_var": 0.03367411295572917, "learning_rate": 0.0001, "loss": 8.1263, "loss/crossentropy": 2.194410800933838, "loss/hidden": 3.1640625, "loss/jsd": 0.0, "loss/logits": 0.260297030210495, "step": 2896 }, { "epoch": 0.181125, "grad_norm": 2.78125, "grad_norm_var": 0.029508463541666665, "learning_rate": 0.0001, "loss": 8.0331, "loss/crossentropy": 2.2540624141693115, "loss/hidden": 3.2109375, "loss/jsd": 0.0, "loss/logits": 0.26236794888973236, "step": 2898 }, { "epoch": 0.18125, "grad_norm": 2.890625, "grad_norm_var": 0.0299957275390625, "learning_rate": 0.0001, "loss": 8.041, "loss/crossentropy": 2.181529998779297, "loss/hidden": 3.171875, "loss/jsd": 0.0, "loss/logits": 0.2571089118719101, "step": 2900 }, { "epoch": 0.181375, "grad_norm": 3.28125, "grad_norm_var": 0.03355204264322917, "learning_rate": 0.0001, "loss": 8.4286, "loss/crossentropy": 2.3706891536712646, "loss/hidden": 3.2265625, "loss/jsd": 0.0, "loss/logits": 0.29068616032600403, "step": 2902 }, { "epoch": 0.1815, "grad_norm": 3.046875, "grad_norm_var": 0.032063802083333336, "learning_rate": 0.0001, "loss": 8.322, "loss/crossentropy": 2.4488571882247925, "loss/hidden": 3.171875, "loss/jsd": 0.0, "loss/logits": 0.25543512403964996, "step": 2904 }, { "epoch": 0.181625, "grad_norm": 2.875, "grad_norm_var": 0.027962239583333333, "learning_rate": 0.0001, "loss": 8.1152, "loss/crossentropy": 2.252503991127014, "loss/hidden": 3.171875, "loss/jsd": 0.0, "loss/logits": 0.25314611941576004, "step": 2906 }, { "epoch": 0.18175, "grad_norm": 2.90625, "grad_norm_var": 0.025829060872395834, "learning_rate": 0.0001, "loss": 8.0455, "loss/crossentropy": 2.0501604080200195, "loss/hidden": 3.15625, "loss/jsd": 0.0, "loss/logits": 0.23801933228969574, "step": 2908 }, { "epoch": 0.181875, "grad_norm": 3.328125, "grad_norm_var": 0.030696614583333334, "learning_rate": 0.0001, "loss": 8.2281, "loss/crossentropy": 2.1372073888778687, "loss/hidden": 3.15625, "loss/jsd": 0.0, "loss/logits": 0.2622874677181244, "step": 2910 }, { "epoch": 0.182, "grad_norm": 3.453125, "grad_norm_var": 0.7165191650390625, "learning_rate": 0.0001, "loss": 8.3927, "loss/crossentropy": 2.423773169517517, "loss/hidden": 3.25, "loss/jsd": 0.0, "loss/logits": 0.2817266881465912, "step": 2912 }, { "epoch": 0.182125, "grad_norm": 3.234375, "grad_norm_var": 0.6845011393229167, "learning_rate": 0.0001, "loss": 8.3796, "loss/crossentropy": 2.2435269355773926, "loss/hidden": 3.328125, "loss/jsd": 0.0, "loss/logits": 0.2818005681037903, "step": 2914 }, { "epoch": 0.18225, "grad_norm": 4.53125, "grad_norm_var": 0.7542154947916667, "learning_rate": 0.0001, "loss": 8.1857, "loss/crossentropy": 2.295470118522644, "loss/hidden": 3.2421875, "loss/jsd": 0.0, "loss/logits": 0.28818129003047943, "step": 2916 }, { "epoch": 0.182375, "grad_norm": 3.640625, "grad_norm_var": 0.7405588785807292, "learning_rate": 0.0001, "loss": 8.3772, "loss/crossentropy": 2.2698758840560913, "loss/hidden": 3.2734375, "loss/jsd": 0.0, "loss/logits": 0.2839386910200119, "step": 2918 }, { "epoch": 0.1825, "grad_norm": 2.984375, "grad_norm_var": 0.7203084309895833, "learning_rate": 0.0001, "loss": 8.25, "loss/crossentropy": 2.4441301822662354, "loss/hidden": 3.2265625, "loss/jsd": 0.0, "loss/logits": 0.26232658326625824, "step": 2920 }, { "epoch": 0.182625, "grad_norm": 3.078125, "grad_norm_var": 0.7089192708333333, "learning_rate": 0.0001, "loss": 8.3258, "loss/crossentropy": 2.4217323064804077, "loss/hidden": 3.3046875, "loss/jsd": 0.0, "loss/logits": 0.2775084972381592, "step": 2922 }, { "epoch": 0.18275, "grad_norm": 3.1875, "grad_norm_var": 0.6976399739583333, "learning_rate": 0.0001, "loss": 8.2529, "loss/crossentropy": 2.181835651397705, "loss/hidden": 3.1875, "loss/jsd": 0.0, "loss/logits": 0.2539723962545395, "step": 2924 }, { "epoch": 0.182875, "grad_norm": 2.953125, "grad_norm_var": 0.7229400634765625, "learning_rate": 0.0001, "loss": 8.3196, "loss/crossentropy": 2.4659998416900635, "loss/hidden": 3.28125, "loss/jsd": 0.0, "loss/logits": 0.27879883348941803, "step": 2926 }, { "epoch": 0.183, "grad_norm": 2.796875, "grad_norm_var": 0.18976236979166666, "learning_rate": 0.0001, "loss": 8.1181, "loss/crossentropy": 2.387602686882019, "loss/hidden": 3.2421875, "loss/jsd": 0.0, "loss/logits": 0.27245059609413147, "step": 2928 }, { "epoch": 0.183125, "grad_norm": 3.0625, "grad_norm_var": 0.18590087890625, "learning_rate": 0.0001, "loss": 8.2374, "loss/crossentropy": 2.31194806098938, "loss/hidden": 3.25, "loss/jsd": 0.0, "loss/logits": 0.26923683285713196, "step": 2930 }, { "epoch": 0.18325, "grad_norm": 3.078125, "grad_norm_var": 0.18599344889322916, "learning_rate": 0.0001, "loss": 8.1433, "loss/crossentropy": 2.1902053356170654, "loss/hidden": 3.1640625, "loss/jsd": 0.0, "loss/logits": 0.2595588266849518, "step": 2932 }, { "epoch": 0.183375, "grad_norm": 2.953125, "grad_norm_var": 0.1603668212890625, "learning_rate": 0.0001, "loss": 8.1451, "loss/crossentropy": 2.359446108341217, "loss/hidden": 3.2578125, "loss/jsd": 0.0, "loss/logits": 0.26731616258621216, "step": 2934 }, { "epoch": 0.1835, "grad_norm": 3.0625, "grad_norm_var": 0.1576812744140625, "learning_rate": 0.0001, "loss": 8.2641, "loss/crossentropy": 2.2995764017105103, "loss/hidden": 3.1640625, "loss/jsd": 0.0, "loss/logits": 0.26409196853637695, "step": 2936 }, { "epoch": 0.183625, "grad_norm": 3.625, "grad_norm_var": 0.1730621337890625, "learning_rate": 0.0001, "loss": 8.3355, "loss/crossentropy": 2.4295204877853394, "loss/hidden": 3.2421875, "loss/jsd": 0.0, "loss/logits": 0.28618867695331573, "step": 2938 }, { "epoch": 0.18375, "grad_norm": 2.90625, "grad_norm_var": 0.1745025634765625, "learning_rate": 0.0001, "loss": 8.3801, "loss/crossentropy": 2.6878920793533325, "loss/hidden": 3.3046875, "loss/jsd": 0.0, "loss/logits": 0.28163112699985504, "step": 2940 }, { "epoch": 0.183875, "grad_norm": 3.078125, "grad_norm_var": 0.17681884765625, "learning_rate": 0.0001, "loss": 8.2906, "loss/crossentropy": 2.4048426151275635, "loss/hidden": 3.2421875, "loss/jsd": 0.0, "loss/logits": 0.2580362558364868, "step": 2942 }, { "epoch": 0.184, "grad_norm": 2.953125, "grad_norm_var": 0.17281494140625, "learning_rate": 0.0001, "loss": 8.1785, "loss/crossentropy": 2.345201253890991, "loss/hidden": 3.234375, "loss/jsd": 0.0, "loss/logits": 0.2763300687074661, "step": 2944 }, { "epoch": 0.184125, "grad_norm": 3.078125, "grad_norm_var": 0.16936848958333334, "learning_rate": 0.0001, "loss": 8.2876, "loss/crossentropy": 2.6029101610183716, "loss/hidden": 3.234375, "loss/jsd": 0.0, "loss/logits": 0.2681310623884201, "step": 2946 }, { "epoch": 0.18425, "grad_norm": 2.921875, "grad_norm_var": 0.036799112955729164, "learning_rate": 0.0001, "loss": 8.2017, "loss/crossentropy": 2.372221827507019, "loss/hidden": 3.21875, "loss/jsd": 0.0, "loss/logits": 0.2688012570142746, "step": 2948 }, { "epoch": 0.184375, "grad_norm": 3.15625, "grad_norm_var": 0.03762613932291667, "learning_rate": 0.0001, "loss": 8.158, "loss/crossentropy": 2.4124748706817627, "loss/hidden": 3.25, "loss/jsd": 0.0, "loss/logits": 0.2789239138364792, "step": 2950 }, { "epoch": 0.1845, "grad_norm": 2.96875, "grad_norm_var": 0.038248697916666664, "learning_rate": 0.0001, "loss": 8.1871, "loss/crossentropy": 2.4088337421417236, "loss/hidden": 3.2109375, "loss/jsd": 0.0, "loss/logits": 0.2847772538661957, "step": 2952 }, { "epoch": 0.184625, "grad_norm": 3.0, "grad_norm_var": 0.01422119140625, "learning_rate": 0.0001, "loss": 8.1798, "loss/crossentropy": 2.482720732688904, "loss/hidden": 3.203125, "loss/jsd": 0.0, "loss/logits": 0.26483161747455597, "step": 2954 }, { "epoch": 0.18475, "grad_norm": 2.84375, "grad_norm_var": 0.015876261393229167, "learning_rate": 0.0001, "loss": 7.9673, "loss/crossentropy": 2.3646737337112427, "loss/hidden": 3.2109375, "loss/jsd": 0.0, "loss/logits": 0.24630165100097656, "step": 2956 }, { "epoch": 0.184875, "grad_norm": 2.953125, "grad_norm_var": 0.0157379150390625, "learning_rate": 0.0001, "loss": 8.1089, "loss/crossentropy": 2.3218533992767334, "loss/hidden": 3.25, "loss/jsd": 0.0, "loss/logits": 0.26656070351600647, "step": 2958 }, { "epoch": 0.185, "grad_norm": 2.984375, "grad_norm_var": 0.015607706705729167, "learning_rate": 0.0001, "loss": 8.2027, "loss/crossentropy": 2.2734180688858032, "loss/hidden": 3.125, "loss/jsd": 0.0, "loss/logits": 0.25169242173433304, "step": 2960 }, { "epoch": 0.185125, "grad_norm": 3.0, "grad_norm_var": 0.013084920247395833, "learning_rate": 0.0001, "loss": 8.2544, "loss/crossentropy": 2.438475728034973, "loss/hidden": 3.1875, "loss/jsd": 0.0, "loss/logits": 0.2564696967601776, "step": 2962 }, { "epoch": 0.18525, "grad_norm": 2.90625, "grad_norm_var": 0.0137847900390625, "learning_rate": 0.0001, "loss": 8.1306, "loss/crossentropy": 2.2471766471862793, "loss/hidden": 3.2265625, "loss/jsd": 0.0, "loss/logits": 0.25111958384513855, "step": 2964 }, { "epoch": 0.185375, "grad_norm": 2.984375, "grad_norm_var": 0.011921183268229166, "learning_rate": 0.0001, "loss": 8.2375, "loss/crossentropy": 2.4147186279296875, "loss/hidden": 3.296875, "loss/jsd": 0.0, "loss/logits": 0.2586153745651245, "step": 2966 }, { "epoch": 0.1855, "grad_norm": 2.75, "grad_norm_var": 0.0188873291015625, "learning_rate": 0.0001, "loss": 8.3192, "loss/crossentropy": 2.4494996070861816, "loss/hidden": 3.2734375, "loss/jsd": 0.0, "loss/logits": 0.2895759344100952, "step": 2968 }, { "epoch": 0.185625, "grad_norm": 3.109375, "grad_norm_var": 0.0251129150390625, "learning_rate": 0.0001, "loss": 8.339, "loss/crossentropy": 2.3236255645751953, "loss/hidden": 3.25, "loss/jsd": 0.0, "loss/logits": 0.30765844881534576, "step": 2970 }, { "epoch": 0.18575, "grad_norm": 3.203125, "grad_norm_var": 0.023909505208333334, "learning_rate": 0.0001, "loss": 8.0738, "loss/crossentropy": 2.198439598083496, "loss/hidden": 3.15625, "loss/jsd": 0.0, "loss/logits": 0.26555734872817993, "step": 2972 }, { "epoch": 0.185875, "grad_norm": 3.0, "grad_norm_var": 0.0239410400390625, "learning_rate": 0.0001, "loss": 8.2006, "loss/crossentropy": 2.0953043699264526, "loss/hidden": 3.1484375, "loss/jsd": 0.0, "loss/logits": 0.26947975903749466, "step": 2974 }, { "epoch": 0.186, "grad_norm": 3.140625, "grad_norm_var": 0.019661458333333333, "learning_rate": 0.0001, "loss": 8.4606, "loss/crossentropy": 2.4526013135910034, "loss/hidden": 3.25, "loss/jsd": 0.0, "loss/logits": 0.28150199353694916, "step": 2976 }, { "epoch": 0.186125, "grad_norm": 2.9375, "grad_norm_var": 0.020637003580729167, "learning_rate": 0.0001, "loss": 8.2076, "loss/crossentropy": 2.2855358123779297, "loss/hidden": 3.1640625, "loss/jsd": 0.0, "loss/logits": 0.25133074820041656, "step": 2978 }, { "epoch": 0.18625, "grad_norm": 2.828125, "grad_norm_var": 0.022802734375, "learning_rate": 0.0001, "loss": 8.3427, "loss/crossentropy": 2.376999020576477, "loss/hidden": 3.21875, "loss/jsd": 0.0, "loss/logits": 0.26656532287597656, "step": 2980 }, { "epoch": 0.186375, "grad_norm": 3.09375, "grad_norm_var": 0.027046712239583333, "learning_rate": 0.0001, "loss": 8.1237, "loss/crossentropy": 2.2938228845596313, "loss/hidden": 3.125, "loss/jsd": 0.0, "loss/logits": 0.24355412274599075, "step": 2982 }, { "epoch": 0.1865, "grad_norm": 3.125, "grad_norm_var": 0.022379557291666668, "learning_rate": 0.0001, "loss": 8.2351, "loss/crossentropy": 2.2552013397216797, "loss/hidden": 3.34375, "loss/jsd": 0.0, "loss/logits": 0.2814163714647293, "step": 2984 }, { "epoch": 0.186625, "grad_norm": 3.140625, "grad_norm_var": 0.017366536458333335, "learning_rate": 0.0001, "loss": 8.0533, "loss/crossentropy": 2.53173291683197, "loss/hidden": 3.203125, "loss/jsd": 0.0, "loss/logits": 0.2805483788251877, "step": 2986 }, { "epoch": 0.18675, "grad_norm": 3.0625, "grad_norm_var": 0.016178385416666666, "learning_rate": 0.0001, "loss": 8.0591, "loss/crossentropy": 2.2584705352783203, "loss/hidden": 3.203125, "loss/jsd": 0.0, "loss/logits": 0.2641800567507744, "step": 2988 }, { "epoch": 0.186875, "grad_norm": 2.921875, "grad_norm_var": 0.022163899739583333, "learning_rate": 0.0001, "loss": 7.9337, "loss/crossentropy": 1.8534721732139587, "loss/hidden": 3.1796875, "loss/jsd": 0.0, "loss/logits": 0.2394864708185196, "step": 2990 }, { "epoch": 0.187, "grad_norm": 3.0625, "grad_norm_var": 0.023420206705729165, "learning_rate": 0.0001, "loss": 8.223, "loss/crossentropy": 2.293165445327759, "loss/hidden": 3.2109375, "loss/jsd": 0.0, "loss/logits": 0.25678662955760956, "step": 2992 }, { "epoch": 0.187125, "grad_norm": 2.796875, "grad_norm_var": 0.030631510416666667, "learning_rate": 0.0001, "loss": 8.0176, "loss/crossentropy": 2.2423532009124756, "loss/hidden": 3.1328125, "loss/jsd": 0.0, "loss/logits": 0.24015968292951584, "step": 2994 }, { "epoch": 0.18725, "grad_norm": 3.203125, "grad_norm_var": 0.032763671875, "learning_rate": 0.0001, "loss": 8.0765, "loss/crossentropy": 2.11912739276886, "loss/hidden": 3.2265625, "loss/jsd": 0.0, "loss/logits": 0.27838442474603653, "step": 2996 }, { "epoch": 0.187375, "grad_norm": 3.109375, "grad_norm_var": 0.03322652180989583, "learning_rate": 0.0001, "loss": 8.332, "loss/crossentropy": 2.430270552635193, "loss/hidden": 3.21875, "loss/jsd": 0.0, "loss/logits": 0.2781721204519272, "step": 2998 }, { "epoch": 0.1875, "grad_norm": 2.984375, "grad_norm_var": 0.030078125, "learning_rate": 0.0001, "loss": 8.0788, "loss/crossentropy": 2.3152072429656982, "loss/hidden": 3.15625, "loss/jsd": 0.0, "loss/logits": 0.24442951381206512, "step": 3000 }, { "epoch": 0.187625, "grad_norm": 2.875, "grad_norm_var": 0.032080078125, "learning_rate": 0.0001, "loss": 7.8221, "loss/crossentropy": 2.044616162776947, "loss/hidden": 3.1875, "loss/jsd": 0.0, "loss/logits": 0.22671421617269516, "step": 3002 }, { "epoch": 0.18775, "grad_norm": 3.140625, "grad_norm_var": 0.0315826416015625, "learning_rate": 0.0001, "loss": 8.0428, "loss/crossentropy": 1.980787992477417, "loss/hidden": 3.2265625, "loss/jsd": 0.0, "loss/logits": 0.24500887095928192, "step": 3004 }, { "epoch": 0.187875, "grad_norm": 3.21875, "grad_norm_var": 0.04780171712239583, "learning_rate": 0.0001, "loss": 8.3364, "loss/crossentropy": 2.239225387573242, "loss/hidden": 3.2421875, "loss/jsd": 0.0, "loss/logits": 0.2766413688659668, "step": 3006 }, { "epoch": 0.188, "grad_norm": 2.8125, "grad_norm_var": 0.049046834309895836, "learning_rate": 0.0001, "loss": 8.1033, "loss/crossentropy": 2.448020100593567, "loss/hidden": 3.1796875, "loss/jsd": 0.0, "loss/logits": 0.26164938509464264, "step": 3008 }, { "epoch": 0.188125, "grad_norm": 3.046875, "grad_norm_var": 0.040673828125, "learning_rate": 0.0001, "loss": 8.3013, "loss/crossentropy": 2.4478694200515747, "loss/hidden": 3.171875, "loss/jsd": 0.0, "loss/logits": 0.25723396241664886, "step": 3010 }, { "epoch": 0.18825, "grad_norm": 2.90625, "grad_norm_var": 0.037083943684895836, "learning_rate": 0.0001, "loss": 8.0114, "loss/crossentropy": 2.2931246757507324, "loss/hidden": 3.171875, "loss/jsd": 0.0, "loss/logits": 0.25216321647167206, "step": 3012 }, { "epoch": 0.188375, "grad_norm": 2.875, "grad_norm_var": 0.036031087239583336, "learning_rate": 0.0001, "loss": 8.0789, "loss/crossentropy": 2.2818782329559326, "loss/hidden": 3.1953125, "loss/jsd": 0.0, "loss/logits": 0.2645493298768997, "step": 3014 }, { "epoch": 0.1885, "grad_norm": 3.390625, "grad_norm_var": 0.043944295247395834, "learning_rate": 0.0001, "loss": 8.0871, "loss/crossentropy": 2.1904850006103516, "loss/hidden": 3.1328125, "loss/jsd": 0.0, "loss/logits": 0.2534702569246292, "step": 3016 }, { "epoch": 0.188625, "grad_norm": 3.0625, "grad_norm_var": 0.040827433268229164, "learning_rate": 0.0001, "loss": 8.2858, "loss/crossentropy": 2.5582823753356934, "loss/hidden": 3.3125, "loss/jsd": 0.0, "loss/logits": 0.28889524936676025, "step": 3018 }, { "epoch": 0.18875, "grad_norm": 3.140625, "grad_norm_var": 0.044896443684895836, "learning_rate": 0.0001, "loss": 8.3075, "loss/crossentropy": 2.334454655647278, "loss/hidden": 3.234375, "loss/jsd": 0.0, "loss/logits": 0.25536222755908966, "step": 3020 }, { "epoch": 0.188875, "grad_norm": 3.09375, "grad_norm_var": 0.025641886393229167, "learning_rate": 0.0001, "loss": 8.0683, "loss/crossentropy": 2.330474376678467, "loss/hidden": 3.1484375, "loss/jsd": 0.0, "loss/logits": 0.2533309534192085, "step": 3022 }, { "epoch": 0.189, "grad_norm": 3.125, "grad_norm_var": 0.025373331705729165, "learning_rate": 0.0001, "loss": 8.111, "loss/crossentropy": 2.410144090652466, "loss/hidden": 3.1640625, "loss/jsd": 0.0, "loss/logits": 0.26289787888526917, "step": 3024 }, { "epoch": 0.189125, "grad_norm": 2.9375, "grad_norm_var": 0.024149576822916668, "learning_rate": 0.0001, "loss": 8.4391, "loss/crossentropy": 2.5078091621398926, "loss/hidden": 3.265625, "loss/jsd": 0.0, "loss/logits": 0.28905192017555237, "step": 3026 }, { "epoch": 0.18925, "grad_norm": 3.09375, "grad_norm_var": 0.024283854166666667, "learning_rate": 0.0001, "loss": 8.213, "loss/crossentropy": 2.3117668628692627, "loss/hidden": 3.25, "loss/jsd": 0.0, "loss/logits": 0.3114188015460968, "step": 3028 }, { "epoch": 0.189375, "grad_norm": 3.0625, "grad_norm_var": 0.022379557291666668, "learning_rate": 0.0001, "loss": 8.3104, "loss/crossentropy": 2.4223134517669678, "loss/hidden": 3.2421875, "loss/jsd": 0.0, "loss/logits": 0.2804117053747177, "step": 3030 }, { "epoch": 0.1895, "grad_norm": 3.109375, "grad_norm_var": 0.016145833333333335, "learning_rate": 0.0001, "loss": 8.1046, "loss/crossentropy": 2.44333016872406, "loss/hidden": 3.1953125, "loss/jsd": 0.0, "loss/logits": 0.2601943165063858, "step": 3032 }, { "epoch": 0.189625, "grad_norm": 3.28125, "grad_norm_var": 0.0273590087890625, "learning_rate": 0.0001, "loss": 8.171, "loss/crossentropy": 2.2447644472122192, "loss/hidden": 3.171875, "loss/jsd": 0.0, "loss/logits": 0.25088611245155334, "step": 3034 }, { "epoch": 0.18975, "grad_norm": 3.234375, "grad_norm_var": 0.0401031494140625, "learning_rate": 0.0001, "loss": 8.1771, "loss/crossentropy": 2.2574959993362427, "loss/hidden": 3.2421875, "loss/jsd": 0.0, "loss/logits": 0.300536185503006, "step": 3036 }, { "epoch": 0.189875, "grad_norm": 2.984375, "grad_norm_var": 0.04713134765625, "learning_rate": 0.0001, "loss": 8.1283, "loss/crossentropy": 2.2428945302963257, "loss/hidden": 3.203125, "loss/jsd": 0.0, "loss/logits": 0.27497410774230957, "step": 3038 }, { "epoch": 0.19, "grad_norm": 2.78125, "grad_norm_var": 0.05065104166666667, "learning_rate": 0.0001, "loss": 8.0159, "loss/crossentropy": 2.2476083040237427, "loss/hidden": 3.1640625, "loss/jsd": 0.0, "loss/logits": 0.2652427852153778, "step": 3040 }, { "epoch": 0.190125, "grad_norm": 2.921875, "grad_norm_var": 0.050862630208333336, "learning_rate": 0.0001, "loss": 8.0181, "loss/crossentropy": 2.2924695014953613, "loss/hidden": 3.1875, "loss/jsd": 0.0, "loss/logits": 0.2591032460331917, "step": 3042 }, { "epoch": 0.19025, "grad_norm": 2.9375, "grad_norm_var": 0.050902303059895834, "learning_rate": 0.0001, "loss": 7.8949, "loss/crossentropy": 2.1451542377471924, "loss/hidden": 3.15625, "loss/jsd": 0.0, "loss/logits": 0.25656062364578247, "step": 3044 }, { "epoch": 0.190375, "grad_norm": 2.9375, "grad_norm_var": 0.051806640625, "learning_rate": 0.0001, "loss": 8.0506, "loss/crossentropy": 2.396212339401245, "loss/hidden": 3.2734375, "loss/jsd": 0.0, "loss/logits": 0.291217640042305, "step": 3046 }, { "epoch": 0.1905, "grad_norm": 2.75, "grad_norm_var": 0.05533447265625, "learning_rate": 0.0001, "loss": 8.1359, "loss/crossentropy": 2.4078985452651978, "loss/hidden": 3.2421875, "loss/jsd": 0.0, "loss/logits": 0.2804025709629059, "step": 3048 }, { "epoch": 0.190625, "grad_norm": 2.78125, "grad_norm_var": 0.04224344889322917, "learning_rate": 0.0001, "loss": 8.0119, "loss/crossentropy": 2.522601842880249, "loss/hidden": 3.2578125, "loss/jsd": 0.0, "loss/logits": 0.29702451825141907, "step": 3050 }, { "epoch": 0.19075, "grad_norm": 2.96875, "grad_norm_var": 0.010400390625, "learning_rate": 0.0001, "loss": 7.8972, "loss/crossentropy": 2.4399465322494507, "loss/hidden": 3.21875, "loss/jsd": 0.0, "loss/logits": 0.2680388540029526, "step": 3052 }, { "epoch": 0.190875, "grad_norm": 3.28125, "grad_norm_var": 0.016974894205729167, "learning_rate": 0.0001, "loss": 7.9247, "loss/crossentropy": 2.208235263824463, "loss/hidden": 3.15625, "loss/jsd": 0.0, "loss/logits": 0.2559407502412796, "step": 3054 }, { "epoch": 0.191, "grad_norm": 2.90625, "grad_norm_var": 0.015119425455729167, "learning_rate": 0.0001, "loss": 8.1328, "loss/crossentropy": 2.2488074898719788, "loss/hidden": 3.15625, "loss/jsd": 0.0, "loss/logits": 0.25658509135246277, "step": 3056 }, { "epoch": 0.191125, "grad_norm": 2.984375, "grad_norm_var": 0.014533487955729167, "learning_rate": 0.0001, "loss": 8.1646, "loss/crossentropy": 2.3440630435943604, "loss/hidden": 3.296875, "loss/jsd": 0.0, "loss/logits": 0.294276162981987, "step": 3058 }, { "epoch": 0.19125, "grad_norm": 2.921875, "grad_norm_var": 0.0190582275390625, "learning_rate": 0.0001, "loss": 8.112, "loss/crossentropy": 2.434723734855652, "loss/hidden": 3.1875, "loss/jsd": 0.0, "loss/logits": 0.25768575817346573, "step": 3060 }, { "epoch": 0.191375, "grad_norm": 2.890625, "grad_norm_var": 0.019969685872395834, "learning_rate": 0.0001, "loss": 8.128, "loss/crossentropy": 2.2569202184677124, "loss/hidden": 3.15625, "loss/jsd": 0.0, "loss/logits": 0.2592715919017792, "step": 3062 }, { "epoch": 0.1915, "grad_norm": 2.875, "grad_norm_var": 0.019196573893229166, "learning_rate": 0.0001, "loss": 7.9687, "loss/crossentropy": 2.238947808742523, "loss/hidden": 3.203125, "loss/jsd": 0.0, "loss/logits": 0.26189571619033813, "step": 3064 }, { "epoch": 0.191625, "grad_norm": 3.015625, "grad_norm_var": 0.0163970947265625, "learning_rate": 0.0001, "loss": 8.1887, "loss/crossentropy": 2.1487661600112915, "loss/hidden": 3.1953125, "loss/jsd": 0.0, "loss/logits": 0.26625922322273254, "step": 3066 }, { "epoch": 0.19175, "grad_norm": 2.8125, "grad_norm_var": 0.0185546875, "learning_rate": 0.0001, "loss": 8.2271, "loss/crossentropy": 2.0519703030586243, "loss/hidden": 3.140625, "loss/jsd": 0.0, "loss/logits": 0.24336016178131104, "step": 3068 }, { "epoch": 0.191875, "grad_norm": 2.78125, "grad_norm_var": 0.016650390625, "learning_rate": 0.0001, "loss": 7.8515, "loss/crossentropy": 2.22737193107605, "loss/hidden": 3.125, "loss/jsd": 0.0, "loss/logits": 0.24473804980516434, "step": 3070 }, { "epoch": 0.192, "grad_norm": 3.03125, "grad_norm_var": 0.017308553059895832, "learning_rate": 0.0001, "loss": 8.0573, "loss/crossentropy": 2.188886821269989, "loss/hidden": 3.234375, "loss/jsd": 0.0, "loss/logits": 0.23682686686515808, "step": 3072 }, { "epoch": 0.192125, "grad_norm": 3.15625, "grad_norm_var": 0.10769755045572917, "learning_rate": 0.0001, "loss": 8.3074, "loss/crossentropy": 2.458534598350525, "loss/hidden": 3.15625, "loss/jsd": 0.0, "loss/logits": 0.25693877041339874, "step": 3074 }, { "epoch": 0.19225, "grad_norm": 2.90625, "grad_norm_var": 0.10537821451822917, "learning_rate": 0.0001, "loss": 8.2244, "loss/crossentropy": 2.522627353668213, "loss/hidden": 3.2109375, "loss/jsd": 0.0, "loss/logits": 0.27664047479629517, "step": 3076 }, { "epoch": 0.192375, "grad_norm": 2.953125, "grad_norm_var": 0.10487874348958333, "learning_rate": 0.0001, "loss": 8.3394, "loss/crossentropy": 2.576285243034363, "loss/hidden": 3.2265625, "loss/jsd": 0.0, "loss/logits": 0.27181732654571533, "step": 3078 }, { "epoch": 0.1925, "grad_norm": 3.09375, "grad_norm_var": 0.10526936848958333, "learning_rate": 0.0001, "loss": 8.0066, "loss/crossentropy": 2.0819945335388184, "loss/hidden": 3.1484375, "loss/jsd": 0.0, "loss/logits": 0.23909074068069458, "step": 3080 }, { "epoch": 0.192625, "grad_norm": 3.265625, "grad_norm_var": 0.1159332275390625, "learning_rate": 0.0001, "loss": 8.1581, "loss/crossentropy": 2.2708539962768555, "loss/hidden": 3.1953125, "loss/jsd": 0.0, "loss/logits": 0.2756985127925873, "step": 3082 }, { "epoch": 0.19275, "grad_norm": 2.859375, "grad_norm_var": 0.11389973958333334, "learning_rate": 0.0001, "loss": 8.2785, "loss/crossentropy": 2.39635694026947, "loss/hidden": 3.171875, "loss/jsd": 0.0, "loss/logits": 0.2778060585260391, "step": 3084 }, { "epoch": 0.192875, "grad_norm": 3.078125, "grad_norm_var": 0.10504557291666666, "learning_rate": 0.0001, "loss": 8.2355, "loss/crossentropy": 2.3772268295288086, "loss/hidden": 3.1640625, "loss/jsd": 0.0, "loss/logits": 0.2498919665813446, "step": 3086 }, { "epoch": 0.193, "grad_norm": 3.3125, "grad_norm_var": 0.10641276041666667, "learning_rate": 0.0001, "loss": 8.1642, "loss/crossentropy": 2.166185975074768, "loss/hidden": 3.2421875, "loss/jsd": 0.0, "loss/logits": 0.2886776030063629, "step": 3088 }, { "epoch": 0.193125, "grad_norm": 2.84375, "grad_norm_var": 0.04262593587239583, "learning_rate": 0.0001, "loss": 8.0373, "loss/crossentropy": 2.174901783466339, "loss/hidden": 3.1328125, "loss/jsd": 0.0, "loss/logits": 0.2579093724489212, "step": 3090 }, { "epoch": 0.19325, "grad_norm": 3.078125, "grad_norm_var": 0.0406402587890625, "learning_rate": 0.0001, "loss": 8.2906, "loss/crossentropy": 2.4586178064346313, "loss/hidden": 3.28125, "loss/jsd": 0.0, "loss/logits": 0.26945915818214417, "step": 3092 }, { "epoch": 0.193375, "grad_norm": 3.140625, "grad_norm_var": 0.04579976399739583, "learning_rate": 0.0001, "loss": 8.0433, "loss/crossentropy": 2.2675609588623047, "loss/hidden": 3.125, "loss/jsd": 0.0, "loss/logits": 0.24192240834236145, "step": 3094 }, { "epoch": 0.1935, "grad_norm": 2.90625, "grad_norm_var": 0.04088134765625, "learning_rate": 0.0001, "loss": 8.1895, "loss/crossentropy": 2.378737211227417, "loss/hidden": 3.234375, "loss/jsd": 0.0, "loss/logits": 0.27523770928382874, "step": 3096 }, { "epoch": 0.193625, "grad_norm": 2.96875, "grad_norm_var": 0.038655598958333336, "learning_rate": 0.0001, "loss": 8.1775, "loss/crossentropy": 2.2450079917907715, "loss/hidden": 3.2265625, "loss/jsd": 0.0, "loss/logits": 0.26037096977233887, "step": 3098 }, { "epoch": 0.19375, "grad_norm": 2.859375, "grad_norm_var": 0.04511617024739583, "learning_rate": 0.0001, "loss": 7.9961, "loss/crossentropy": 2.214326858520508, "loss/hidden": 3.1484375, "loss/jsd": 0.0, "loss/logits": 0.24653871357440948, "step": 3100 }, { "epoch": 0.193875, "grad_norm": 3.046875, "grad_norm_var": 0.04381510416666667, "learning_rate": 0.0001, "loss": 8.1401, "loss/crossentropy": 2.153132200241089, "loss/hidden": 3.15625, "loss/jsd": 0.0, "loss/logits": 0.26044490188360214, "step": 3102 }, { "epoch": 0.194, "grad_norm": 2.84375, "grad_norm_var": 0.03082275390625, "learning_rate": 0.0001, "loss": 8.0079, "loss/crossentropy": 2.1559587717056274, "loss/hidden": 3.15625, "loss/jsd": 0.0, "loss/logits": 0.24349378049373627, "step": 3104 }, { "epoch": 0.194125, "grad_norm": 3.25, "grad_norm_var": 0.03339436848958333, "learning_rate": 0.0001, "loss": 8.2171, "loss/crossentropy": 2.144322395324707, "loss/hidden": 3.1640625, "loss/jsd": 0.0, "loss/logits": 0.2607101500034332, "step": 3106 }, { "epoch": 0.19425, "grad_norm": 3.09375, "grad_norm_var": 0.17703348795572918, "learning_rate": 0.0001, "loss": 8.2271, "loss/crossentropy": 2.158595383167267, "loss/hidden": 3.1875, "loss/jsd": 0.0, "loss/logits": 0.25443463027477264, "step": 3108 }, { "epoch": 0.194375, "grad_norm": 3.265625, "grad_norm_var": 0.17183837890625, "learning_rate": 0.0001, "loss": 8.2891, "loss/crossentropy": 2.3929585218429565, "loss/hidden": 3.21875, "loss/jsd": 0.0, "loss/logits": 0.2684243321418762, "step": 3110 }, { "epoch": 0.1945, "grad_norm": 3.203125, "grad_norm_var": 0.2350250244140625, "learning_rate": 0.0001, "loss": 8.2582, "loss/crossentropy": 2.22498095035553, "loss/hidden": 3.171875, "loss/jsd": 0.0, "loss/logits": 0.2694687396287918, "step": 3112 }, { "epoch": 0.194625, "grad_norm": 2.984375, "grad_norm_var": 0.23277587890625, "learning_rate": 0.0001, "loss": 8.2521, "loss/crossentropy": 2.2842044830322266, "loss/hidden": 3.1875, "loss/jsd": 0.0, "loss/logits": 0.2741314470767975, "step": 3114 }, { "epoch": 0.19475, "grad_norm": 3.109375, "grad_norm_var": 0.208984375, "learning_rate": 0.0001, "loss": 8.4313, "loss/crossentropy": 2.719553828239441, "loss/hidden": 3.2734375, "loss/jsd": 0.0, "loss/logits": 0.28248198330402374, "step": 3116 }, { "epoch": 0.194875, "grad_norm": 2.90625, "grad_norm_var": 0.21138916015625, "learning_rate": 0.0001, "loss": 8.0922, "loss/crossentropy": 2.3392093181610107, "loss/hidden": 3.1484375, "loss/jsd": 0.0, "loss/logits": 0.24478980898857117, "step": 3118 }, { "epoch": 0.195, "grad_norm": 2.859375, "grad_norm_var": 0.20575764973958333, "learning_rate": 0.0001, "loss": 8.1143, "loss/crossentropy": 2.2137898802757263, "loss/hidden": 3.15625, "loss/jsd": 0.0, "loss/logits": 0.25172895193099976, "step": 3120 }, { "epoch": 0.195125, "grad_norm": 3.015625, "grad_norm_var": 0.20693257649739583, "learning_rate": 0.0001, "loss": 7.8959, "loss/crossentropy": 2.22554612159729, "loss/hidden": 3.171875, "loss/jsd": 0.0, "loss/logits": 0.2681921124458313, "step": 3122 }, { "epoch": 0.19525, "grad_norm": 2.65625, "grad_norm_var": 0.1068359375, "learning_rate": 0.0001, "loss": 8.1436, "loss/crossentropy": 2.3785455226898193, "loss/hidden": 3.171875, "loss/jsd": 0.0, "loss/logits": 0.2584487646818161, "step": 3124 }, { "epoch": 0.195375, "grad_norm": 3.03125, "grad_norm_var": 0.1123687744140625, "learning_rate": 0.0001, "loss": 8.0, "loss/crossentropy": 2.149886727333069, "loss/hidden": 3.21875, "loss/jsd": 0.0, "loss/logits": 0.2602705806493759, "step": 3126 }, { "epoch": 0.1955, "grad_norm": 2.84375, "grad_norm_var": 0.038248697916666664, "learning_rate": 0.0001, "loss": 8.0842, "loss/crossentropy": 2.262367606163025, "loss/hidden": 3.1875, "loss/jsd": 0.0, "loss/logits": 0.26558272540569305, "step": 3128 }, { "epoch": 0.195625, "grad_norm": 2.90625, "grad_norm_var": 0.04039713541666667, "learning_rate": 0.0001, "loss": 7.9579, "loss/crossentropy": 2.3544100522994995, "loss/hidden": 3.1796875, "loss/jsd": 0.0, "loss/logits": 0.25334376096725464, "step": 3130 }, { "epoch": 0.19575, "grad_norm": 3.09375, "grad_norm_var": 0.040135701497395836, "learning_rate": 0.0001, "loss": 8.1151, "loss/crossentropy": 2.2972534894943237, "loss/hidden": 3.171875, "loss/jsd": 0.0, "loss/logits": 0.26104989647865295, "step": 3132 }, { "epoch": 0.195875, "grad_norm": 3.046875, "grad_norm_var": 0.03992411295572917, "learning_rate": 0.0001, "loss": 8.1342, "loss/crossentropy": 2.1989673376083374, "loss/hidden": 3.1796875, "loss/jsd": 0.0, "loss/logits": 0.277900829911232, "step": 3134 }, { "epoch": 0.196, "grad_norm": 2.9375, "grad_norm_var": 0.039460245768229166, "learning_rate": 0.0001, "loss": 8.0714, "loss/crossentropy": 2.4464385509490967, "loss/hidden": 3.1953125, "loss/jsd": 0.0, "loss/logits": 0.2850402891635895, "step": 3136 }, { "epoch": 0.196125, "grad_norm": 3.0625, "grad_norm_var": 0.030094401041666666, "learning_rate": 0.0001, "loss": 8.0838, "loss/crossentropy": 2.524294137954712, "loss/hidden": 3.2265625, "loss/jsd": 0.0, "loss/logits": 0.2605943828821182, "step": 3138 }, { "epoch": 0.19625, "grad_norm": 2.859375, "grad_norm_var": 0.02447509765625, "learning_rate": 0.0001, "loss": 8.2011, "loss/crossentropy": 2.336963415145874, "loss/hidden": 3.15625, "loss/jsd": 0.0, "loss/logits": 0.2699812203645706, "step": 3140 }, { "epoch": 0.196375, "grad_norm": 2.8125, "grad_norm_var": 0.025927734375, "learning_rate": 0.0001, "loss": 8.0793, "loss/crossentropy": 2.0558972358703613, "loss/hidden": 3.125, "loss/jsd": 0.0, "loss/logits": 0.23027782887220383, "step": 3142 }, { "epoch": 0.1965, "grad_norm": 3.0, "grad_norm_var": 0.011995442708333333, "learning_rate": 0.0001, "loss": 8.0777, "loss/crossentropy": 2.488237142562866, "loss/hidden": 3.203125, "loss/jsd": 0.0, "loss/logits": 0.2672172784805298, "step": 3144 }, { "epoch": 0.196625, "grad_norm": 2.59375, "grad_norm_var": 0.018488566080729168, "learning_rate": 0.0001, "loss": 7.9179, "loss/crossentropy": 2.1932790279388428, "loss/hidden": 3.203125, "loss/jsd": 0.0, "loss/logits": 0.24824626743793488, "step": 3146 }, { "epoch": 0.19675, "grad_norm": 3.546875, "grad_norm_var": 0.04078369140625, "learning_rate": 0.0001, "loss": 8.3246, "loss/crossentropy": 2.5695748329162598, "loss/hidden": 3.234375, "loss/jsd": 0.0, "loss/logits": 0.29574574530124664, "step": 3148 }, { "epoch": 0.196875, "grad_norm": 2.828125, "grad_norm_var": 0.041112263997395836, "learning_rate": 0.0001, "loss": 7.97, "loss/crossentropy": 2.1290723085403442, "loss/hidden": 3.15625, "loss/jsd": 0.0, "loss/logits": 0.25089605897665024, "step": 3150 }, { "epoch": 0.197, "grad_norm": 3.15625, "grad_norm_var": 0.04358622233072917, "learning_rate": 0.0001, "loss": 8.2776, "loss/crossentropy": 2.170577347278595, "loss/hidden": 3.140625, "loss/jsd": 0.0, "loss/logits": 0.2691800892353058, "step": 3152 }, { "epoch": 0.197125, "grad_norm": 2.796875, "grad_norm_var": 0.05865478515625, "learning_rate": 0.0001, "loss": 8.2212, "loss/crossentropy": 2.328927516937256, "loss/hidden": 3.171875, "loss/jsd": 0.0, "loss/logits": 0.26250624656677246, "step": 3154 }, { "epoch": 0.19725, "grad_norm": 2.875, "grad_norm_var": 0.060498046875, "learning_rate": 0.0001, "loss": 7.9754, "loss/crossentropy": 1.8585203289985657, "loss/hidden": 3.1796875, "loss/jsd": 0.0, "loss/logits": 0.24215184152126312, "step": 3156 }, { "epoch": 0.197375, "grad_norm": 3.015625, "grad_norm_var": 0.05597330729166667, "learning_rate": 0.0001, "loss": 7.9847, "loss/crossentropy": 2.3386768102645874, "loss/hidden": 3.1875, "loss/jsd": 0.0, "loss/logits": 0.24663898348808289, "step": 3158 }, { "epoch": 0.1975, "grad_norm": 2.828125, "grad_norm_var": 0.059956868489583336, "learning_rate": 0.0001, "loss": 8.0886, "loss/crossentropy": 2.3307093381881714, "loss/hidden": 3.1875, "loss/jsd": 0.0, "loss/logits": 0.2650502547621727, "step": 3160 }, { "epoch": 0.197625, "grad_norm": 3.484375, "grad_norm_var": 0.06946614583333334, "learning_rate": 0.0001, "loss": 8.2584, "loss/crossentropy": 2.4045934677124023, "loss/hidden": 3.21875, "loss/jsd": 0.0, "loss/logits": 0.26501110196113586, "step": 3162 }, { "epoch": 0.19775, "grad_norm": 3.21875, "grad_norm_var": 0.0585357666015625, "learning_rate": 0.0001, "loss": 8.2483, "loss/crossentropy": 2.2774007320404053, "loss/hidden": 3.1953125, "loss/jsd": 0.0, "loss/logits": 0.26311296224594116, "step": 3164 }, { "epoch": 0.197875, "grad_norm": 3.140625, "grad_norm_var": 0.060212198893229166, "learning_rate": 0.0001, "loss": 8.075, "loss/crossentropy": 2.447927236557007, "loss/hidden": 3.234375, "loss/jsd": 0.0, "loss/logits": 0.28195714950561523, "step": 3166 }, { "epoch": 0.198, "grad_norm": 2.9375, "grad_norm_var": 0.05999348958333333, "learning_rate": 0.0001, "loss": 7.9802, "loss/crossentropy": 2.1790822744369507, "loss/hidden": 3.140625, "loss/jsd": 0.0, "loss/logits": 0.2555471360683441, "step": 3168 }, { "epoch": 0.198125, "grad_norm": 2.90625, "grad_norm_var": 0.04986979166666667, "learning_rate": 0.0001, "loss": 8.0028, "loss/crossentropy": 2.2405868768692017, "loss/hidden": 3.203125, "loss/jsd": 0.0, "loss/logits": 0.2627221643924713, "step": 3170 }, { "epoch": 0.19825, "grad_norm": 2.921875, "grad_norm_var": 0.048216756184895834, "learning_rate": 0.0001, "loss": 8.0685, "loss/crossentropy": 2.2628936767578125, "loss/hidden": 3.203125, "loss/jsd": 0.0, "loss/logits": 0.2600160911679268, "step": 3172 }, { "epoch": 0.198375, "grad_norm": 3.109375, "grad_norm_var": 0.050191243489583336, "learning_rate": 0.0001, "loss": 8.184, "loss/crossentropy": 2.193703293800354, "loss/hidden": 3.203125, "loss/jsd": 0.0, "loss/logits": 0.2586003839969635, "step": 3174 }, { "epoch": 0.1985, "grad_norm": 3.125, "grad_norm_var": 0.046483357747395836, "learning_rate": 0.0001, "loss": 8.3664, "loss/crossentropy": 2.391955852508545, "loss/hidden": 3.2578125, "loss/jsd": 0.0, "loss/logits": 0.2850872576236725, "step": 3176 }, { "epoch": 0.198625, "grad_norm": 2.984375, "grad_norm_var": 0.019287109375, "learning_rate": 0.0001, "loss": 7.8907, "loss/crossentropy": 1.952785313129425, "loss/hidden": 3.203125, "loss/jsd": 0.0, "loss/logits": 0.24282608181238174, "step": 3178 }, { "epoch": 0.19875, "grad_norm": 3.015625, "grad_norm_var": 0.014460245768229166, "learning_rate": 0.0001, "loss": 8.1804, "loss/crossentropy": 2.3660178184509277, "loss/hidden": 3.1953125, "loss/jsd": 0.0, "loss/logits": 0.2768784761428833, "step": 3180 }, { "epoch": 0.198875, "grad_norm": 3.0625, "grad_norm_var": 0.010904947916666666, "learning_rate": 0.0001, "loss": 8.1394, "loss/crossentropy": 2.3029706478118896, "loss/hidden": 3.3359375, "loss/jsd": 0.0, "loss/logits": 0.2709043174982071, "step": 3182 }, { "epoch": 0.199, "grad_norm": 2.890625, "grad_norm_var": 0.011458333333333333, "learning_rate": 0.0001, "loss": 8.1912, "loss/crossentropy": 2.247406482696533, "loss/hidden": 3.203125, "loss/jsd": 0.0, "loss/logits": 0.27382896840572357, "step": 3184 }, { "epoch": 0.199125, "grad_norm": 3.1875, "grad_norm_var": 0.013084920247395833, "learning_rate": 0.0001, "loss": 8.0932, "loss/crossentropy": 2.283021092414856, "loss/hidden": 3.171875, "loss/jsd": 0.0, "loss/logits": 0.2655387371778488, "step": 3186 }, { "epoch": 0.19925, "grad_norm": 2.84375, "grad_norm_var": 0.011986287434895833, "learning_rate": 0.0001, "loss": 8.1942, "loss/crossentropy": 2.3839281797409058, "loss/hidden": 3.1875, "loss/jsd": 0.0, "loss/logits": 0.2643355578184128, "step": 3188 }, { "epoch": 0.199375, "grad_norm": 2.953125, "grad_norm_var": 0.011979166666666667, "learning_rate": 0.0001, "loss": 8.1036, "loss/crossentropy": 2.3398349285125732, "loss/hidden": 3.2421875, "loss/jsd": 0.0, "loss/logits": 0.2707003504037857, "step": 3190 }, { "epoch": 0.1995, "grad_norm": 2.90625, "grad_norm_var": 0.0106597900390625, "learning_rate": 0.0001, "loss": 7.9858, "loss/crossentropy": 2.2766858339309692, "loss/hidden": 3.15625, "loss/jsd": 0.0, "loss/logits": 0.26129937171936035, "step": 3192 }, { "epoch": 0.199625, "grad_norm": 5.59375, "grad_norm_var": 0.43753255208333336, "learning_rate": 0.0001, "loss": 8.4462, "loss/crossentropy": 2.1764395236968994, "loss/hidden": 3.1875, "loss/jsd": 0.0, "loss/logits": 0.2510588988661766, "step": 3194 }, { "epoch": 0.19975, "grad_norm": 3.484375, "grad_norm_var": 0.44677632649739585, "learning_rate": 0.0001, "loss": 8.3175, "loss/crossentropy": 2.365424394607544, "loss/hidden": 3.265625, "loss/jsd": 0.0, "loss/logits": 0.29546721279621124, "step": 3196 }, { "epoch": 0.199875, "grad_norm": 2.859375, "grad_norm_var": 0.45133056640625, "learning_rate": 0.0001, "loss": 8.0542, "loss/crossentropy": 2.315553069114685, "loss/hidden": 3.234375, "loss/jsd": 0.0, "loss/logits": 0.2633578032255173, "step": 3198 }, { "epoch": 0.2, "grad_norm": 2.84375, "grad_norm_var": 0.45888671875, "learning_rate": 0.0001, "loss": 8.0282, "loss/crossentropy": 2.266456127166748, "loss/hidden": 3.140625, "loss/jsd": 0.0, "loss/logits": 0.24378645420074463, "step": 3200 }, { "epoch": 0.200125, "grad_norm": 3.125, "grad_norm_var": 0.45402730305989586, "learning_rate": 0.0001, "loss": 8.2908, "loss/crossentropy": 2.457024335861206, "loss/hidden": 3.21875, "loss/jsd": 0.0, "loss/logits": 0.27127622067928314, "step": 3202 }, { "epoch": 0.20025, "grad_norm": 3.03125, "grad_norm_var": 0.44961649576822915, "learning_rate": 0.0001, "loss": 8.0589, "loss/crossentropy": 2.238759994506836, "loss/hidden": 3.203125, "loss/jsd": 0.0, "loss/logits": 0.24700668454170227, "step": 3204 }, { "epoch": 0.200375, "grad_norm": 3.21875, "grad_norm_var": 0.45494384765625, "learning_rate": 0.0001, "loss": 8.2079, "loss/crossentropy": 2.1566935777664185, "loss/hidden": 3.1640625, "loss/jsd": 0.0, "loss/logits": 0.2526049539446831, "step": 3206 }, { "epoch": 0.2005, "grad_norm": 2.796875, "grad_norm_var": 0.45370686848958336, "learning_rate": 0.0001, "loss": 8.2606, "loss/crossentropy": 2.3832950592041016, "loss/hidden": 3.1953125, "loss/jsd": 0.0, "loss/logits": 0.26506198942661285, "step": 3208 }, { "epoch": 0.200625, "grad_norm": 3.5625, "grad_norm_var": 0.07243550618489583, "learning_rate": 0.0001, "loss": 8.0725, "loss/crossentropy": 2.2661877870559692, "loss/hidden": 3.21875, "loss/jsd": 0.0, "loss/logits": 0.2683081030845642, "step": 3210 }, { "epoch": 0.20075, "grad_norm": 2.8125, "grad_norm_var": 0.05592041015625, "learning_rate": 0.0001, "loss": 8.2014, "loss/crossentropy": 2.455717086791992, "loss/hidden": 3.2109375, "loss/jsd": 0.0, "loss/logits": 0.2620306462049484, "step": 3212 }, { "epoch": 0.200875, "grad_norm": 3.15625, "grad_norm_var": 0.045670572916666666, "learning_rate": 0.0001, "loss": 8.2718, "loss/crossentropy": 2.3754059076309204, "loss/hidden": 3.2578125, "loss/jsd": 0.0, "loss/logits": 0.27477268874645233, "step": 3214 }, { "epoch": 0.201, "grad_norm": 2.8125, "grad_norm_var": 0.042529296875, "learning_rate": 0.0001, "loss": 8.0394, "loss/crossentropy": 2.190205931663513, "loss/hidden": 3.1796875, "loss/jsd": 0.0, "loss/logits": 0.2606823891401291, "step": 3216 }, { "epoch": 0.201125, "grad_norm": 2.9375, "grad_norm_var": 0.0420318603515625, "learning_rate": 0.0001, "loss": 8.2165, "loss/crossentropy": 2.421746611595154, "loss/hidden": 3.2421875, "loss/jsd": 0.0, "loss/logits": 0.2913060784339905, "step": 3218 }, { "epoch": 0.20125, "grad_norm": 3.140625, "grad_norm_var": 0.0412109375, "learning_rate": 0.0001, "loss": 8.1104, "loss/crossentropy": 2.221086263656616, "loss/hidden": 3.1484375, "loss/jsd": 0.0, "loss/logits": 0.25914373993873596, "step": 3220 }, { "epoch": 0.201375, "grad_norm": 2.84375, "grad_norm_var": 0.038863118489583334, "learning_rate": 0.0001, "loss": 8.1592, "loss/crossentropy": 2.4511712789535522, "loss/hidden": 3.234375, "loss/jsd": 0.0, "loss/logits": 0.2737935334444046, "step": 3222 }, { "epoch": 0.2015, "grad_norm": 3.0625, "grad_norm_var": 0.035628255208333334, "learning_rate": 0.0001, "loss": 8.0365, "loss/crossentropy": 2.123769164085388, "loss/hidden": 3.203125, "loss/jsd": 0.0, "loss/logits": 0.23229189217090607, "step": 3224 }, { "epoch": 0.201625, "grad_norm": 2.796875, "grad_norm_var": 0.017650349934895834, "learning_rate": 0.0001, "loss": 8.0493, "loss/crossentropy": 2.178188681602478, "loss/hidden": 3.1875, "loss/jsd": 0.0, "loss/logits": 0.2590486854314804, "step": 3226 }, { "epoch": 0.20175, "grad_norm": 3.171875, "grad_norm_var": 0.01549072265625, "learning_rate": 0.0001, "loss": 7.9211, "loss/crossentropy": 2.4011971950531006, "loss/hidden": 3.1875, "loss/jsd": 0.0, "loss/logits": 0.25986021757125854, "step": 3228 }, { "epoch": 0.201875, "grad_norm": 2.8125, "grad_norm_var": 0.03179931640625, "learning_rate": 0.0001, "loss": 8.2361, "loss/crossentropy": 2.5181671380996704, "loss/hidden": 3.1953125, "loss/jsd": 0.0, "loss/logits": 0.2665943503379822, "step": 3230 }, { "epoch": 0.202, "grad_norm": 3.046875, "grad_norm_var": 0.028483072916666668, "learning_rate": 0.0001, "loss": 8.1181, "loss/crossentropy": 2.1627626419067383, "loss/hidden": 3.1640625, "loss/jsd": 0.0, "loss/logits": 0.2532324343919754, "step": 3232 }, { "epoch": 0.202125, "grad_norm": 2.90625, "grad_norm_var": 0.0302886962890625, "learning_rate": 0.0001, "loss": 8.0974, "loss/crossentropy": 2.4595226049423218, "loss/hidden": 3.1796875, "loss/jsd": 0.0, "loss/logits": 0.2654995620250702, "step": 3234 }, { "epoch": 0.20225, "grad_norm": 2.953125, "grad_norm_var": 0.03790690104166667, "learning_rate": 0.0001, "loss": 7.87, "loss/crossentropy": 2.2840529680252075, "loss/hidden": 3.15625, "loss/jsd": 0.0, "loss/logits": 0.25225698947906494, "step": 3236 }, { "epoch": 0.202375, "grad_norm": 3.0625, "grad_norm_var": 0.03756103515625, "learning_rate": 0.0001, "loss": 8.0822, "loss/crossentropy": 2.3052114248275757, "loss/hidden": 3.140625, "loss/jsd": 0.0, "loss/logits": 0.254858136177063, "step": 3238 }, { "epoch": 0.2025, "grad_norm": 2.921875, "grad_norm_var": 0.0365234375, "learning_rate": 0.0001, "loss": 8.0767, "loss/crossentropy": 2.3538131713867188, "loss/hidden": 3.1796875, "loss/jsd": 0.0, "loss/logits": 0.2715635895729065, "step": 3240 }, { "epoch": 0.202625, "grad_norm": 3.09375, "grad_norm_var": 0.034989420572916666, "learning_rate": 0.0001, "loss": 8.1945, "loss/crossentropy": 2.387078642845154, "loss/hidden": 3.2578125, "loss/jsd": 0.0, "loss/logits": 0.26866745948791504, "step": 3242 }, { "epoch": 0.20275, "grad_norm": 3.109375, "grad_norm_var": 0.035302734375, "learning_rate": 0.0001, "loss": 8.2018, "loss/crossentropy": 2.474520683288574, "loss/hidden": 3.2109375, "loss/jsd": 0.0, "loss/logits": 0.26195022463798523, "step": 3244 }, { "epoch": 0.202875, "grad_norm": 2.953125, "grad_norm_var": 0.016569010416666665, "learning_rate": 0.0001, "loss": 8.1921, "loss/crossentropy": 2.494389772415161, "loss/hidden": 3.203125, "loss/jsd": 0.0, "loss/logits": 0.27728429436683655, "step": 3246 }, { "epoch": 0.203, "grad_norm": 2.96875, "grad_norm_var": 0.0160552978515625, "learning_rate": 0.0001, "loss": 8.2538, "loss/crossentropy": 2.2659478187561035, "loss/hidden": 3.1953125, "loss/jsd": 0.0, "loss/logits": 0.2834864854812622, "step": 3248 }, { "epoch": 0.203125, "grad_norm": 2.96875, "grad_norm_var": 0.015067545572916667, "learning_rate": 0.0001, "loss": 8.3668, "loss/crossentropy": 2.5000627040863037, "loss/hidden": 3.1875, "loss/jsd": 0.0, "loss/logits": 0.27600064873695374, "step": 3250 }, { "epoch": 0.20325, "grad_norm": 2.984375, "grad_norm_var": 0.011295572916666666, "learning_rate": 0.0001, "loss": 8.2863, "loss/crossentropy": 2.4377931356430054, "loss/hidden": 3.1953125, "loss/jsd": 0.0, "loss/logits": 0.2744172215461731, "step": 3252 }, { "epoch": 0.203375, "grad_norm": 3.140625, "grad_norm_var": 0.011474609375, "learning_rate": 0.0001, "loss": 8.0717, "loss/crossentropy": 2.045997738838196, "loss/hidden": 3.140625, "loss/jsd": 0.0, "loss/logits": 0.2477242648601532, "step": 3254 }, { "epoch": 0.2035, "grad_norm": 3.1875, "grad_norm_var": 0.017552693684895832, "learning_rate": 0.0001, "loss": 8.2171, "loss/crossentropy": 2.2741061449050903, "loss/hidden": 3.2578125, "loss/jsd": 0.0, "loss/logits": 0.2541493773460388, "step": 3256 }, { "epoch": 0.203625, "grad_norm": 2.796875, "grad_norm_var": 0.0221588134765625, "learning_rate": 0.0001, "loss": 8.0597, "loss/crossentropy": 2.317218542098999, "loss/hidden": 3.1484375, "loss/jsd": 0.0, "loss/logits": 0.26807165145874023, "step": 3258 }, { "epoch": 0.20375, "grad_norm": 2.828125, "grad_norm_var": 0.027074178059895832, "learning_rate": 0.0001, "loss": 8.1394, "loss/crossentropy": 2.196931838989258, "loss/hidden": 3.171875, "loss/jsd": 0.0, "loss/logits": 0.2481948360800743, "step": 3260 }, { "epoch": 0.203875, "grad_norm": 2.984375, "grad_norm_var": 0.0280670166015625, "learning_rate": 0.0001, "loss": 8.2271, "loss/crossentropy": 2.6477065086364746, "loss/hidden": 3.234375, "loss/jsd": 0.0, "loss/logits": 0.2857190817594528, "step": 3262 }, { "epoch": 0.204, "grad_norm": 3.203125, "grad_norm_var": 0.032892862955729164, "learning_rate": 0.0001, "loss": 8.1574, "loss/crossentropy": 2.189841389656067, "loss/hidden": 3.1796875, "loss/jsd": 0.0, "loss/logits": 0.2925741523504257, "step": 3264 }, { "epoch": 0.204125, "grad_norm": 3.0, "grad_norm_var": 0.03297119140625, "learning_rate": 0.0001, "loss": 7.9733, "loss/crossentropy": 2.350952982902527, "loss/hidden": 3.1484375, "loss/jsd": 0.0, "loss/logits": 0.2603686600923538, "step": 3266 }, { "epoch": 0.20425, "grad_norm": 2.890625, "grad_norm_var": 0.0315338134765625, "learning_rate": 0.0001, "loss": 8.1512, "loss/crossentropy": 2.661333441734314, "loss/hidden": 3.2109375, "loss/jsd": 0.0, "loss/logits": 0.26404714584350586, "step": 3268 }, { "epoch": 0.204375, "grad_norm": 3.234375, "grad_norm_var": 0.0362701416015625, "learning_rate": 0.0001, "loss": 8.0497, "loss/crossentropy": 2.4684576988220215, "loss/hidden": 3.140625, "loss/jsd": 0.0, "loss/logits": 0.25150536000728607, "step": 3270 }, { "epoch": 0.2045, "grad_norm": 3.015625, "grad_norm_var": 0.0236236572265625, "learning_rate": 0.0001, "loss": 8.0187, "loss/crossentropy": 2.1839378476142883, "loss/hidden": 3.234375, "loss/jsd": 0.0, "loss/logits": 0.26790380477905273, "step": 3272 }, { "epoch": 0.204625, "grad_norm": 2.859375, "grad_norm_var": 0.024421183268229167, "learning_rate": 0.0001, "loss": 7.948, "loss/crossentropy": 2.191369652748108, "loss/hidden": 3.1484375, "loss/jsd": 0.0, "loss/logits": 0.25659455358982086, "step": 3274 }, { "epoch": 0.20475, "grad_norm": 3.375, "grad_norm_var": 0.034749348958333336, "learning_rate": 0.0001, "loss": 7.9327, "loss/crossentropy": 2.1985403299331665, "loss/hidden": 3.15625, "loss/jsd": 0.0, "loss/logits": 0.27150973677635193, "step": 3276 }, { "epoch": 0.204875, "grad_norm": 2.96875, "grad_norm_var": 0.03560791015625, "learning_rate": 0.0001, "loss": 8.1169, "loss/crossentropy": 2.2231366634368896, "loss/hidden": 3.203125, "loss/jsd": 0.0, "loss/logits": 0.26266512274742126, "step": 3278 }, { "epoch": 0.205, "grad_norm": 2.96875, "grad_norm_var": 0.0292633056640625, "learning_rate": 0.0001, "loss": 8.1368, "loss/crossentropy": 2.1812610626220703, "loss/hidden": 3.140625, "loss/jsd": 0.0, "loss/logits": 0.25296615064144135, "step": 3280 }, { "epoch": 0.205125, "grad_norm": 3.125, "grad_norm_var": 0.031083170572916666, "learning_rate": 0.0001, "loss": 8.0905, "loss/crossentropy": 2.2935560941696167, "loss/hidden": 3.2578125, "loss/jsd": 0.0, "loss/logits": 0.24997325241565704, "step": 3282 }, { "epoch": 0.20525, "grad_norm": 2.96875, "grad_norm_var": 0.035791015625, "learning_rate": 0.0001, "loss": 7.9969, "loss/crossentropy": 2.0980414152145386, "loss/hidden": 3.1640625, "loss/jsd": 0.0, "loss/logits": 0.2588716074824333, "step": 3284 }, { "epoch": 0.205375, "grad_norm": 2.828125, "grad_norm_var": 0.037109375, "learning_rate": 0.0001, "loss": 7.9273, "loss/crossentropy": 2.116236090660095, "loss/hidden": 3.171875, "loss/jsd": 0.0, "loss/logits": 0.25013136863708496, "step": 3286 }, { "epoch": 0.2055, "grad_norm": 2.96875, "grad_norm_var": 0.03726806640625, "learning_rate": 0.0001, "loss": 8.0264, "loss/crossentropy": 2.4083213806152344, "loss/hidden": 3.1640625, "loss/jsd": 0.0, "loss/logits": 0.27061036229133606, "step": 3288 }, { "epoch": 0.205625, "grad_norm": 3.234375, "grad_norm_var": 0.03951822916666667, "learning_rate": 0.0001, "loss": 8.0898, "loss/crossentropy": 2.4575644731521606, "loss/hidden": 3.1875, "loss/jsd": 0.0, "loss/logits": 0.2591339647769928, "step": 3290 }, { "epoch": 0.20575, "grad_norm": 2.953125, "grad_norm_var": 0.028385416666666666, "learning_rate": 0.0001, "loss": 8.0784, "loss/crossentropy": 2.386906147003174, "loss/hidden": 3.171875, "loss/jsd": 0.0, "loss/logits": 0.2638823390007019, "step": 3292 }, { "epoch": 0.205875, "grad_norm": 3.03125, "grad_norm_var": 0.02633056640625, "learning_rate": 0.0001, "loss": 8.0757, "loss/crossentropy": 2.2036038637161255, "loss/hidden": 3.15625, "loss/jsd": 0.0, "loss/logits": 0.24547383934259415, "step": 3294 }, { "epoch": 0.206, "grad_norm": 2.78125, "grad_norm_var": 0.0339508056640625, "learning_rate": 0.0001, "loss": 7.9217, "loss/crossentropy": 2.3093923330307007, "loss/hidden": 3.1796875, "loss/jsd": 0.0, "loss/logits": 0.28410804271698, "step": 3296 }, { "epoch": 0.206125, "grad_norm": 2.796875, "grad_norm_var": 0.0346832275390625, "learning_rate": 0.0001, "loss": 8.0937, "loss/crossentropy": 2.3748456239700317, "loss/hidden": 3.1875, "loss/jsd": 0.0, "loss/logits": 0.26681068539619446, "step": 3298 }, { "epoch": 0.20625, "grad_norm": 2.875, "grad_norm_var": 0.030712890625, "learning_rate": 0.0001, "loss": 8.2334, "loss/crossentropy": 2.5478373765945435, "loss/hidden": 3.2578125, "loss/jsd": 0.0, "loss/logits": 0.2881655991077423, "step": 3300 }, { "epoch": 0.206375, "grad_norm": 2.96875, "grad_norm_var": 0.026984659830729167, "learning_rate": 0.0001, "loss": 8.1905, "loss/crossentropy": 2.2184265851974487, "loss/hidden": 3.140625, "loss/jsd": 0.0, "loss/logits": 0.26013314723968506, "step": 3302 }, { "epoch": 0.2065, "grad_norm": 2.96875, "grad_norm_var": 0.028597005208333335, "learning_rate": 0.0001, "loss": 8.165, "loss/crossentropy": 2.2159677743911743, "loss/hidden": 3.1484375, "loss/jsd": 0.0, "loss/logits": 0.2637548893690109, "step": 3304 }, { "epoch": 0.206625, "grad_norm": 3.078125, "grad_norm_var": 0.0255035400390625, "learning_rate": 0.0001, "loss": 8.2577, "loss/crossentropy": 2.555700659751892, "loss/hidden": 3.265625, "loss/jsd": 0.0, "loss/logits": 0.28978103399276733, "step": 3306 }, { "epoch": 0.20675, "grad_norm": 2.9375, "grad_norm_var": 0.024657185872395834, "learning_rate": 0.0001, "loss": 8.0698, "loss/crossentropy": 2.1323585510253906, "loss/hidden": 3.1328125, "loss/jsd": 0.0, "loss/logits": 0.27381235361099243, "step": 3308 }, { "epoch": 0.206875, "grad_norm": 2.953125, "grad_norm_var": 0.02353515625, "learning_rate": 0.0001, "loss": 8.2182, "loss/crossentropy": 2.354387640953064, "loss/hidden": 3.1875, "loss/jsd": 0.0, "loss/logits": 0.27996116876602173, "step": 3310 }, { "epoch": 0.207, "grad_norm": 2.875, "grad_norm_var": 0.015729777018229165, "learning_rate": 0.0001, "loss": 8.2811, "loss/crossentropy": 2.5515745878219604, "loss/hidden": 3.1875, "loss/jsd": 0.0, "loss/logits": 0.2727746069431305, "step": 3312 }, { "epoch": 0.207125, "grad_norm": 2.90625, "grad_norm_var": 0.01539306640625, "learning_rate": 0.0001, "loss": 8.0698, "loss/crossentropy": 2.4017679691314697, "loss/hidden": 3.171875, "loss/jsd": 0.0, "loss/logits": 0.24894237518310547, "step": 3314 }, { "epoch": 0.20725, "grad_norm": 3.078125, "grad_norm_var": 0.016597493489583334, "learning_rate": 0.0001, "loss": 8.2728, "loss/crossentropy": 2.1459991931915283, "loss/hidden": 3.25, "loss/jsd": 0.0, "loss/logits": 0.2560963034629822, "step": 3316 }, { "epoch": 0.207375, "grad_norm": 2.78125, "grad_norm_var": 0.015397135416666667, "learning_rate": 0.0001, "loss": 7.9846, "loss/crossentropy": 2.299985408782959, "loss/hidden": 3.171875, "loss/jsd": 0.0, "loss/logits": 0.26821835339069366, "step": 3318 }, { "epoch": 0.2075, "grad_norm": 2.734375, "grad_norm_var": 0.020173136393229166, "learning_rate": 0.0001, "loss": 7.7324, "loss/crossentropy": 2.0270864367485046, "loss/hidden": 3.171875, "loss/jsd": 0.0, "loss/logits": 0.24157769232988358, "step": 3320 }, { "epoch": 0.207625, "grad_norm": 2.96875, "grad_norm_var": 0.014676920572916667, "learning_rate": 0.0001, "loss": 8.0535, "loss/crossentropy": 2.2521005868911743, "loss/hidden": 3.125, "loss/jsd": 0.0, "loss/logits": 0.24373552203178406, "step": 3322 }, { "epoch": 0.20775, "grad_norm": 2.921875, "grad_norm_var": 0.014925130208333333, "learning_rate": 0.0001, "loss": 8.0202, "loss/crossentropy": 2.2878029346466064, "loss/hidden": 3.1796875, "loss/jsd": 0.0, "loss/logits": 0.25381772220134735, "step": 3324 }, { "epoch": 0.207875, "grad_norm": 2.9375, "grad_norm_var": 0.014826456705729166, "learning_rate": 0.0001, "loss": 7.96, "loss/crossentropy": 2.2602121829986572, "loss/hidden": 3.1640625, "loss/jsd": 0.0, "loss/logits": 0.26364803314208984, "step": 3326 }, { "epoch": 0.208, "grad_norm": 2.890625, "grad_norm_var": 0.01470947265625, "learning_rate": 0.0001, "loss": 8.0367, "loss/crossentropy": 2.48467481136322, "loss/hidden": 3.1171875, "loss/jsd": 0.0, "loss/logits": 0.24693025648593903, "step": 3328 }, { "epoch": 0.208125, "grad_norm": 3.421875, "grad_norm_var": 0.0357421875, "learning_rate": 0.0001, "loss": 8.2808, "loss/crossentropy": 2.243489623069763, "loss/hidden": 3.265625, "loss/jsd": 0.0, "loss/logits": 0.28045617043972015, "step": 3330 }, { "epoch": 0.20825, "grad_norm": 3.015625, "grad_norm_var": 0.0317779541015625, "learning_rate": 0.0001, "loss": 8.2617, "loss/crossentropy": 2.4428763389587402, "loss/hidden": 3.234375, "loss/jsd": 0.0, "loss/logits": 0.27135811746120453, "step": 3332 }, { "epoch": 0.208375, "grad_norm": 2.765625, "grad_norm_var": 0.0314453125, "learning_rate": 0.0001, "loss": 8.1175, "loss/crossentropy": 2.453363060951233, "loss/hidden": 3.1796875, "loss/jsd": 0.0, "loss/logits": 0.2776189148426056, "step": 3334 }, { "epoch": 0.2085, "grad_norm": 2.96875, "grad_norm_var": 0.0255859375, "learning_rate": 0.0001, "loss": 8.1894, "loss/crossentropy": 2.3482531309127808, "loss/hidden": 3.171875, "loss/jsd": 0.0, "loss/logits": 0.2623911052942276, "step": 3336 }, { "epoch": 0.208625, "grad_norm": 3.546875, "grad_norm_var": 0.047638956705729166, "learning_rate": 0.0001, "loss": 8.2654, "loss/crossentropy": 2.3092031478881836, "loss/hidden": 3.21875, "loss/jsd": 0.0, "loss/logits": 0.28854168951511383, "step": 3338 }, { "epoch": 0.20875, "grad_norm": 2.9375, "grad_norm_var": 0.0513824462890625, "learning_rate": 0.0001, "loss": 8.0428, "loss/crossentropy": 2.1508899927139282, "loss/hidden": 3.1640625, "loss/jsd": 0.0, "loss/logits": 0.2546079680323601, "step": 3340 }, { "epoch": 0.208875, "grad_norm": 2.890625, "grad_norm_var": 0.050732421875, "learning_rate": 0.0001, "loss": 8.2438, "loss/crossentropy": 2.3433796167373657, "loss/hidden": 3.25, "loss/jsd": 0.0, "loss/logits": 0.26718689501285553, "step": 3342 }, { "epoch": 0.209, "grad_norm": 3.015625, "grad_norm_var": 0.048127237955729166, "learning_rate": 0.0001, "loss": 7.8111, "loss/crossentropy": 2.229587435722351, "loss/hidden": 3.15625, "loss/jsd": 0.0, "loss/logits": 0.25351928174495697, "step": 3344 }, { "epoch": 0.209125, "grad_norm": 3.03125, "grad_norm_var": 0.03713785807291667, "learning_rate": 0.0001, "loss": 8.0189, "loss/crossentropy": 2.3170838356018066, "loss/hidden": 3.2109375, "loss/jsd": 0.0, "loss/logits": 0.2664952799677849, "step": 3346 }, { "epoch": 0.20925, "grad_norm": 3.125, "grad_norm_var": 0.038899739583333336, "learning_rate": 0.0001, "loss": 8.2421, "loss/crossentropy": 2.2571229934692383, "loss/hidden": 3.171875, "loss/jsd": 0.0, "loss/logits": 0.270868718624115, "step": 3348 }, { "epoch": 0.209375, "grad_norm": 2.9375, "grad_norm_var": 0.0376129150390625, "learning_rate": 0.0001, "loss": 8.1379, "loss/crossentropy": 2.4776333570480347, "loss/hidden": 3.171875, "loss/jsd": 0.0, "loss/logits": 0.25770139694213867, "step": 3350 }, { "epoch": 0.2095, "grad_norm": 2.875, "grad_norm_var": 0.0370269775390625, "learning_rate": 0.0001, "loss": 8.1088, "loss/crossentropy": 2.157105803489685, "loss/hidden": 3.1953125, "loss/jsd": 0.0, "loss/logits": 0.26475150883197784, "step": 3352 }, { "epoch": 0.209625, "grad_norm": 2.953125, "grad_norm_var": 0.037495930989583336, "learning_rate": 0.0001, "loss": 8.2263, "loss/crossentropy": 2.4708096981048584, "loss/hidden": 3.2109375, "loss/jsd": 0.0, "loss/logits": 0.27709396183490753, "step": 3354 }, { "epoch": 0.20975, "grad_norm": 2.84375, "grad_norm_var": 0.035863240559895836, "learning_rate": 0.0001, "loss": 8.0677, "loss/crossentropy": 2.376826286315918, "loss/hidden": 3.1796875, "loss/jsd": 0.0, "loss/logits": 0.2647465467453003, "step": 3356 }, { "epoch": 0.209875, "grad_norm": 2.9375, "grad_norm_var": 0.03557535807291667, "learning_rate": 0.0001, "loss": 8.1983, "loss/crossentropy": 2.3289496898651123, "loss/hidden": 3.1328125, "loss/jsd": 0.0, "loss/logits": 0.2608029991388321, "step": 3358 }, { "epoch": 0.21, "grad_norm": 2.75, "grad_norm_var": 0.04039306640625, "learning_rate": 0.0001, "loss": 8.0714, "loss/crossentropy": 2.345235228538513, "loss/hidden": 3.1484375, "loss/jsd": 0.0, "loss/logits": 0.26208456605672836, "step": 3360 }, { "epoch": 0.210125, "grad_norm": 2.875, "grad_norm_var": 0.038483683268229166, "learning_rate": 0.0001, "loss": 8.1277, "loss/crossentropy": 2.259727954864502, "loss/hidden": 3.1640625, "loss/jsd": 0.0, "loss/logits": 0.2612561881542206, "step": 3362 }, { "epoch": 0.21025, "grad_norm": 2.828125, "grad_norm_var": 0.04000244140625, "learning_rate": 0.0001, "loss": 8.0557, "loss/crossentropy": 2.2856264114379883, "loss/hidden": 3.109375, "loss/jsd": 0.0, "loss/logits": 0.2503708600997925, "step": 3364 }, { "epoch": 0.210375, "grad_norm": 2.84375, "grad_norm_var": 0.03703511555989583, "learning_rate": 0.0001, "loss": 7.9275, "loss/crossentropy": 2.3697084188461304, "loss/hidden": 3.1328125, "loss/jsd": 0.0, "loss/logits": 0.24406228959560394, "step": 3366 }, { "epoch": 0.2105, "grad_norm": 2.890625, "grad_norm_var": 0.0388092041015625, "learning_rate": 0.0001, "loss": 7.9977, "loss/crossentropy": 2.3035097122192383, "loss/hidden": 3.1484375, "loss/jsd": 0.0, "loss/logits": 0.25051476061344147, "step": 3368 }, { "epoch": 0.210625, "grad_norm": 2.96875, "grad_norm_var": 0.01090087890625, "learning_rate": 0.0001, "loss": 8.2405, "loss/crossentropy": 2.349913001060486, "loss/hidden": 3.2265625, "loss/jsd": 0.0, "loss/logits": 0.3001957833766937, "step": 3370 }, { "epoch": 0.21075, "grad_norm": 3.09375, "grad_norm_var": 0.0122955322265625, "learning_rate": 0.0001, "loss": 8.223, "loss/crossentropy": 2.424271821975708, "loss/hidden": 3.171875, "loss/jsd": 0.0, "loss/logits": 0.2743752747774124, "step": 3372 }, { "epoch": 0.210875, "grad_norm": 2.765625, "grad_norm_var": 0.013798014322916666, "learning_rate": 0.0001, "loss": 8.0953, "loss/crossentropy": 2.3287577629089355, "loss/hidden": 3.1484375, "loss/jsd": 0.0, "loss/logits": 0.27096617221832275, "step": 3374 }, { "epoch": 0.211, "grad_norm": 3.171875, "grad_norm_var": 0.016901652018229168, "learning_rate": 0.0001, "loss": 8.1759, "loss/crossentropy": 2.4276020526885986, "loss/hidden": 3.1796875, "loss/jsd": 0.0, "loss/logits": 0.25759419798851013, "step": 3376 }, { "epoch": 0.211125, "grad_norm": 2.90625, "grad_norm_var": 0.015265909830729167, "learning_rate": 0.0001, "loss": 7.8948, "loss/crossentropy": 2.512119174003601, "loss/hidden": 3.140625, "loss/jsd": 0.0, "loss/logits": 0.24548564851284027, "step": 3378 }, { "epoch": 0.21125, "grad_norm": 2.734375, "grad_norm_var": 0.014411417643229167, "learning_rate": 0.0001, "loss": 8.1024, "loss/crossentropy": 2.285138726234436, "loss/hidden": 3.15625, "loss/jsd": 0.0, "loss/logits": 0.2538295388221741, "step": 3380 }, { "epoch": 0.211375, "grad_norm": 2.765625, "grad_norm_var": 0.020067342122395835, "learning_rate": 0.0001, "loss": 8.3056, "loss/crossentropy": 2.543992042541504, "loss/hidden": 3.15625, "loss/jsd": 0.0, "loss/logits": 0.26885660737752914, "step": 3382 }, { "epoch": 0.2115, "grad_norm": 3.328125, "grad_norm_var": 0.02701416015625, "learning_rate": 0.0001, "loss": 7.9994, "loss/crossentropy": 2.5173155069351196, "loss/hidden": 3.203125, "loss/jsd": 0.0, "loss/logits": 0.26409924030303955, "step": 3384 }, { "epoch": 0.211625, "grad_norm": 2.828125, "grad_norm_var": 0.029157511393229165, "learning_rate": 0.0001, "loss": 8.0691, "loss/crossentropy": 2.1601486206054688, "loss/hidden": 3.1484375, "loss/jsd": 0.0, "loss/logits": 0.24972151964902878, "step": 3386 }, { "epoch": 0.21175, "grad_norm": 3.109375, "grad_norm_var": 0.031083170572916666, "learning_rate": 0.0001, "loss": 7.9928, "loss/crossentropy": 2.074127435684204, "loss/hidden": 3.125, "loss/jsd": 0.0, "loss/logits": 0.2501622885465622, "step": 3388 }, { "epoch": 0.211875, "grad_norm": 3.09375, "grad_norm_var": 0.02877197265625, "learning_rate": 0.0001, "loss": 8.0324, "loss/crossentropy": 2.1279070377349854, "loss/hidden": 3.15625, "loss/jsd": 0.0, "loss/logits": 0.24174269288778305, "step": 3390 }, { "epoch": 0.212, "grad_norm": 2.765625, "grad_norm_var": 0.0286529541015625, "learning_rate": 0.0001, "loss": 8.2032, "loss/crossentropy": 2.4869847297668457, "loss/hidden": 3.171875, "loss/jsd": 0.0, "loss/logits": 0.26013004779815674, "step": 3392 }, { "epoch": 0.212125, "grad_norm": 2.84375, "grad_norm_var": 0.04604390462239583, "learning_rate": 0.0001, "loss": 8.2452, "loss/crossentropy": 2.279478073120117, "loss/hidden": 3.1328125, "loss/jsd": 0.0, "loss/logits": 0.25594570487737656, "step": 3394 }, { "epoch": 0.21225, "grad_norm": 3.09375, "grad_norm_var": 0.044408162434895836, "learning_rate": 0.0001, "loss": 8.1751, "loss/crossentropy": 2.32720410823822, "loss/hidden": 3.265625, "loss/jsd": 0.0, "loss/logits": 0.2753504514694214, "step": 3396 }, { "epoch": 0.212375, "grad_norm": 2.890625, "grad_norm_var": 0.039159138997395836, "learning_rate": 0.0001, "loss": 8.0944, "loss/crossentropy": 2.517144560813904, "loss/hidden": 3.171875, "loss/jsd": 0.0, "loss/logits": 0.2721696197986603, "step": 3398 }, { "epoch": 0.2125, "grad_norm": 3.09375, "grad_norm_var": 0.0320465087890625, "learning_rate": 0.0001, "loss": 8.204, "loss/crossentropy": 2.1578654050827026, "loss/hidden": 3.203125, "loss/jsd": 0.0, "loss/logits": 0.2626515179872513, "step": 3400 }, { "epoch": 0.212625, "grad_norm": 2.84375, "grad_norm_var": 0.030663045247395833, "learning_rate": 0.0001, "loss": 8.0256, "loss/crossentropy": 2.3374911546707153, "loss/hidden": 3.1796875, "loss/jsd": 0.0, "loss/logits": 0.2765570878982544, "step": 3402 }, { "epoch": 0.21275, "grad_norm": 2.796875, "grad_norm_var": 0.03200581868489583, "learning_rate": 0.0001, "loss": 7.8734, "loss/crossentropy": 2.1568061113357544, "loss/hidden": 3.140625, "loss/jsd": 0.0, "loss/logits": 0.23823237419128418, "step": 3404 }, { "epoch": 0.212875, "grad_norm": 2.828125, "grad_norm_var": 0.032548014322916666, "learning_rate": 0.0001, "loss": 8.0153, "loss/crossentropy": 2.302879214286804, "loss/hidden": 3.140625, "loss/jsd": 0.0, "loss/logits": 0.2629336416721344, "step": 3406 }, { "epoch": 0.213, "grad_norm": 3.0, "grad_norm_var": 0.03235270182291667, "learning_rate": 0.0001, "loss": 8.0735, "loss/crossentropy": 2.365081310272217, "loss/hidden": 3.2265625, "loss/jsd": 0.0, "loss/logits": 0.2782520353794098, "step": 3408 }, { "epoch": 0.213125, "grad_norm": 2.9375, "grad_norm_var": 0.013667805989583334, "learning_rate": 0.0001, "loss": 8.1104, "loss/crossentropy": 2.244347095489502, "loss/hidden": 3.1328125, "loss/jsd": 0.0, "loss/logits": 0.24217111617326736, "step": 3410 }, { "epoch": 0.21325, "grad_norm": 2.984375, "grad_norm_var": 0.011937459309895834, "learning_rate": 0.0001, "loss": 8.1616, "loss/crossentropy": 2.3036710023880005, "loss/hidden": 3.3203125, "loss/jsd": 0.0, "loss/logits": 0.24564654380083084, "step": 3412 }, { "epoch": 0.213375, "grad_norm": 3.015625, "grad_norm_var": 0.012043253580729166, "learning_rate": 0.0001, "loss": 7.9978, "loss/crossentropy": 2.301898717880249, "loss/hidden": 3.171875, "loss/jsd": 0.0, "loss/logits": 0.23626847565174103, "step": 3414 }, { "epoch": 0.2135, "grad_norm": 2.84375, "grad_norm_var": 0.0104156494140625, "learning_rate": 0.0001, "loss": 7.7726, "loss/crossentropy": 1.9760905504226685, "loss/hidden": 3.0625, "loss/jsd": 0.0, "loss/logits": 0.2343737632036209, "step": 3416 }, { "epoch": 0.213625, "grad_norm": 2.734375, "grad_norm_var": 0.012333170572916666, "learning_rate": 0.0001, "loss": 7.9653, "loss/crossentropy": 2.24316668510437, "loss/hidden": 3.15625, "loss/jsd": 0.0, "loss/logits": 0.26175426691770554, "step": 3418 }, { "epoch": 0.21375, "grad_norm": 2.890625, "grad_norm_var": 0.0106842041015625, "learning_rate": 0.0001, "loss": 7.8672, "loss/crossentropy": 2.237221360206604, "loss/hidden": 3.09375, "loss/jsd": 0.0, "loss/logits": 0.26167693734169006, "step": 3420 }, { "epoch": 0.213875, "grad_norm": 3.1875, "grad_norm_var": 0.014518229166666667, "learning_rate": 0.0001, "loss": 8.1044, "loss/crossentropy": 2.649104952812195, "loss/hidden": 3.265625, "loss/jsd": 0.0, "loss/logits": 0.28041093051433563, "step": 3422 }, { "epoch": 0.214, "grad_norm": 2.703125, "grad_norm_var": 0.017552693684895832, "learning_rate": 0.0001, "loss": 7.7207, "loss/crossentropy": 2.257661819458008, "loss/hidden": 3.1640625, "loss/jsd": 0.0, "loss/logits": 0.25010478496551514, "step": 3424 }, { "epoch": 0.214125, "grad_norm": 3.125, "grad_norm_var": 0.020213826497395834, "learning_rate": 0.0001, "loss": 8.1403, "loss/crossentropy": 2.2480964064598083, "loss/hidden": 3.2109375, "loss/jsd": 0.0, "loss/logits": 0.25986164808273315, "step": 3426 }, { "epoch": 0.21425, "grad_norm": 2.6875, "grad_norm_var": 0.022391764322916667, "learning_rate": 0.0001, "loss": 7.915, "loss/crossentropy": 2.3254162073135376, "loss/hidden": 3.1640625, "loss/jsd": 0.0, "loss/logits": 0.25972771644592285, "step": 3428 }, { "epoch": 0.214375, "grad_norm": 3.0, "grad_norm_var": 0.023111979166666668, "learning_rate": 0.0001, "loss": 7.9769, "loss/crossentropy": 2.129208207130432, "loss/hidden": 3.1328125, "loss/jsd": 0.0, "loss/logits": 0.24452001601457596, "step": 3430 }, { "epoch": 0.2145, "grad_norm": 2.71875, "grad_norm_var": 0.024779256184895834, "learning_rate": 0.0001, "loss": 7.8228, "loss/crossentropy": 2.2781134843826294, "loss/hidden": 3.171875, "loss/jsd": 0.0, "loss/logits": 0.24319136142730713, "step": 3432 }, { "epoch": 0.214625, "grad_norm": 2.890625, "grad_norm_var": 0.0223785400390625, "learning_rate": 0.0001, "loss": 8.1741, "loss/crossentropy": 2.2903696298599243, "loss/hidden": 3.1640625, "loss/jsd": 0.0, "loss/logits": 0.2773251011967659, "step": 3434 }, { "epoch": 0.21475, "grad_norm": 2.84375, "grad_norm_var": 0.022509765625, "learning_rate": 0.0001, "loss": 8.1499, "loss/crossentropy": 2.368951678276062, "loss/hidden": 3.1875, "loss/jsd": 0.0, "loss/logits": 0.26731523871421814, "step": 3436 }, { "epoch": 0.214875, "grad_norm": 3.25, "grad_norm_var": 0.026634724934895833, "learning_rate": 0.0001, "loss": 8.2168, "loss/crossentropy": 2.4735910892486572, "loss/hidden": 3.171875, "loss/jsd": 0.0, "loss/logits": 0.2662598043680191, "step": 3438 }, { "epoch": 0.215, "grad_norm": 2.875, "grad_norm_var": 0.022770182291666666, "learning_rate": 0.0001, "loss": 8.229, "loss/crossentropy": 2.4794031381607056, "loss/hidden": 3.1953125, "loss/jsd": 0.0, "loss/logits": 0.2601815089583397, "step": 3440 }, { "epoch": 0.215125, "grad_norm": 2.9375, "grad_norm_var": 0.019124348958333332, "learning_rate": 0.0001, "loss": 8.1565, "loss/crossentropy": 2.5665329694747925, "loss/hidden": 3.21875, "loss/jsd": 0.0, "loss/logits": 0.2701922208070755, "step": 3442 }, { "epoch": 0.21525, "grad_norm": 3.03125, "grad_norm_var": 0.01568603515625, "learning_rate": 0.0001, "loss": 8.2036, "loss/crossentropy": 2.5179523229599, "loss/hidden": 3.15625, "loss/jsd": 0.0, "loss/logits": 0.25429390370845795, "step": 3444 }, { "epoch": 0.215375, "grad_norm": 2.921875, "grad_norm_var": 0.015217081705729166, "learning_rate": 0.0001, "loss": 8.1567, "loss/crossentropy": 2.416312336921692, "loss/hidden": 3.1875, "loss/jsd": 0.0, "loss/logits": 0.28215354681015015, "step": 3446 }, { "epoch": 0.2155, "grad_norm": 2.8125, "grad_norm_var": 0.015070597330729166, "learning_rate": 0.0001, "loss": 8.1865, "loss/crossentropy": 2.6812586784362793, "loss/hidden": 3.21875, "loss/jsd": 0.0, "loss/logits": 0.2766028195619583, "step": 3448 }, { "epoch": 0.215625, "grad_norm": 2.859375, "grad_norm_var": 0.018485514322916667, "learning_rate": 0.0001, "loss": 8.1219, "loss/crossentropy": 2.3808157444000244, "loss/hidden": 3.265625, "loss/jsd": 0.0, "loss/logits": 0.28973422944545746, "step": 3450 }, { "epoch": 0.21575, "grad_norm": 3.03125, "grad_norm_var": 0.019331868489583334, "learning_rate": 0.0001, "loss": 8.2494, "loss/crossentropy": 2.3658465147018433, "loss/hidden": 3.15625, "loss/jsd": 0.0, "loss/logits": 0.23564206808805466, "step": 3452 }, { "epoch": 0.215875, "grad_norm": 3.03125, "grad_norm_var": 0.016292317708333334, "learning_rate": 0.0001, "loss": 8.0049, "loss/crossentropy": 2.420639157295227, "loss/hidden": 3.25, "loss/jsd": 0.0, "loss/logits": 0.2846282720565796, "step": 3454 }, { "epoch": 0.216, "grad_norm": 2.765625, "grad_norm_var": 0.024592081705729168, "learning_rate": 0.0001, "loss": 8.2144, "loss/crossentropy": 2.3309932947158813, "loss/hidden": 3.15625, "loss/jsd": 0.0, "loss/logits": 0.25562766194343567, "step": 3456 }, { "epoch": 0.216125, "grad_norm": 3.046875, "grad_norm_var": 0.0251617431640625, "learning_rate": 0.0001, "loss": 7.777, "loss/crossentropy": 2.1230533719062805, "loss/hidden": 3.140625, "loss/jsd": 0.0, "loss/logits": 0.2454065978527069, "step": 3458 }, { "epoch": 0.21625, "grad_norm": 2.890625, "grad_norm_var": 0.025129191080729165, "learning_rate": 0.0001, "loss": 7.8995, "loss/crossentropy": 2.214825987815857, "loss/hidden": 3.140625, "loss/jsd": 0.0, "loss/logits": 0.2496349811553955, "step": 3460 }, { "epoch": 0.216375, "grad_norm": 3.09375, "grad_norm_var": 0.025861612955729165, "learning_rate": 0.0001, "loss": 8.1565, "loss/crossentropy": 2.5217314958572388, "loss/hidden": 3.2421875, "loss/jsd": 0.0, "loss/logits": 0.2674577683210373, "step": 3462 }, { "epoch": 0.2165, "grad_norm": 2.734375, "grad_norm_var": 0.028343709309895833, "learning_rate": 0.0001, "loss": 8.1601, "loss/crossentropy": 2.164268136024475, "loss/hidden": 3.15625, "loss/jsd": 0.0, "loss/logits": 0.2537472993135452, "step": 3464 }, { "epoch": 0.216625, "grad_norm": 2.875, "grad_norm_var": 0.0252349853515625, "learning_rate": 0.0001, "loss": 8.0046, "loss/crossentropy": 2.4043670892715454, "loss/hidden": 3.171875, "loss/jsd": 0.0, "loss/logits": 0.25185615569353104, "step": 3466 }, { "epoch": 0.21675, "grad_norm": 2.84375, "grad_norm_var": 0.0302642822265625, "learning_rate": 0.0001, "loss": 7.6261, "loss/crossentropy": 2.0687937140464783, "loss/hidden": 3.109375, "loss/jsd": 0.0, "loss/logits": 0.23499736189842224, "step": 3468 }, { "epoch": 0.216875, "grad_norm": 3.015625, "grad_norm_var": 0.028205362955729167, "learning_rate": 0.0001, "loss": 7.869, "loss/crossentropy": 2.3043712377548218, "loss/hidden": 3.1328125, "loss/jsd": 0.0, "loss/logits": 0.24541200697422028, "step": 3470 }, { "epoch": 0.217, "grad_norm": 3.0625, "grad_norm_var": 0.021637980143229166, "learning_rate": 0.0001, "loss": 8.1061, "loss/crossentropy": 2.347123861312866, "loss/hidden": 3.2265625, "loss/jsd": 0.0, "loss/logits": 0.26428553462028503, "step": 3472 }, { "epoch": 0.217125, "grad_norm": 3.046875, "grad_norm_var": 0.021483357747395834, "learning_rate": 0.0001, "loss": 8.0869, "loss/crossentropy": 2.2847089767456055, "loss/hidden": 3.1640625, "loss/jsd": 0.0, "loss/logits": 0.2628851383924484, "step": 3474 }, { "epoch": 0.21725, "grad_norm": 2.609375, "grad_norm_var": 0.027179972330729166, "learning_rate": 0.0001, "loss": 7.8983, "loss/crossentropy": 2.110204815864563, "loss/hidden": 3.1640625, "loss/jsd": 0.0, "loss/logits": 0.2360912710428238, "step": 3476 }, { "epoch": 0.217375, "grad_norm": 3.15625, "grad_norm_var": 0.027213541666666667, "learning_rate": 0.0001, "loss": 8.1178, "loss/crossentropy": 2.3014683723449707, "loss/hidden": 3.1171875, "loss/jsd": 0.0, "loss/logits": 0.25195126235485077, "step": 3478 }, { "epoch": 0.2175, "grad_norm": 3.09375, "grad_norm_var": 0.0236328125, "learning_rate": 0.0001, "loss": 8.0819, "loss/crossentropy": 2.4000685811042786, "loss/hidden": 3.125, "loss/jsd": 0.0, "loss/logits": 0.25215842574834824, "step": 3480 }, { "epoch": 0.217625, "grad_norm": 2.75, "grad_norm_var": 0.028385416666666666, "learning_rate": 0.0001, "loss": 7.8973, "loss/crossentropy": 2.091042637825012, "loss/hidden": 3.0859375, "loss/jsd": 0.0, "loss/logits": 0.24015626311302185, "step": 3482 }, { "epoch": 0.21775, "grad_norm": 2.890625, "grad_norm_var": 0.025553385416666668, "learning_rate": 0.0001, "loss": 8.0738, "loss/crossentropy": 2.3160455226898193, "loss/hidden": 3.1796875, "loss/jsd": 0.0, "loss/logits": 0.26687709987163544, "step": 3484 }, { "epoch": 0.217875, "grad_norm": 2.953125, "grad_norm_var": 0.0260406494140625, "learning_rate": 0.0001, "loss": 8.2908, "loss/crossentropy": 2.410978317260742, "loss/hidden": 3.1953125, "loss/jsd": 0.0, "loss/logits": 0.27568891644477844, "step": 3486 }, { "epoch": 0.218, "grad_norm": 3.046875, "grad_norm_var": 0.02578125, "learning_rate": 0.0001, "loss": 8.3535, "loss/crossentropy": 2.2229000329971313, "loss/hidden": 3.140625, "loss/jsd": 0.0, "loss/logits": 0.2667589634656906, "step": 3488 }, { "epoch": 0.218125, "grad_norm": 3.03125, "grad_norm_var": 0.02955322265625, "learning_rate": 0.0001, "loss": 8.1352, "loss/crossentropy": 2.2095683217048645, "loss/hidden": 3.1796875, "loss/jsd": 0.0, "loss/logits": 0.26982176303863525, "step": 3490 }, { "epoch": 0.21825, "grad_norm": 2.84375, "grad_norm_var": 0.0224273681640625, "learning_rate": 0.0001, "loss": 7.9164, "loss/crossentropy": 2.281849980354309, "loss/hidden": 3.1875, "loss/jsd": 0.0, "loss/logits": 0.25118981301784515, "step": 3492 }, { "epoch": 0.218375, "grad_norm": 3.140625, "grad_norm_var": 0.021708170572916668, "learning_rate": 0.0001, "loss": 7.9892, "loss/crossentropy": 2.229013442993164, "loss/hidden": 3.1328125, "loss/jsd": 0.0, "loss/logits": 0.2572949305176735, "step": 3494 }, { "epoch": 0.2185, "grad_norm": 2.859375, "grad_norm_var": 0.022614542643229166, "learning_rate": 0.0001, "loss": 8.2191, "loss/crossentropy": 2.1624966859817505, "loss/hidden": 3.140625, "loss/jsd": 0.0, "loss/logits": 0.2592136114835739, "step": 3496 }, { "epoch": 0.218625, "grad_norm": 2.8125, "grad_norm_var": 0.019343058268229168, "learning_rate": 0.0001, "loss": 8.1435, "loss/crossentropy": 2.446608304977417, "loss/hidden": 3.15625, "loss/jsd": 0.0, "loss/logits": 0.25450390577316284, "step": 3498 }, { "epoch": 0.21875, "grad_norm": 2.859375, "grad_norm_var": 0.0186920166015625, "learning_rate": 0.0001, "loss": 7.7994, "loss/crossentropy": 2.082264542579651, "loss/hidden": 3.1484375, "loss/jsd": 0.0, "loss/logits": 0.23312175273895264, "step": 3500 }, { "epoch": 0.218875, "grad_norm": 3.171875, "grad_norm_var": 0.024347941080729168, "learning_rate": 0.0001, "loss": 7.9793, "loss/crossentropy": 2.3270751237869263, "loss/hidden": 3.1640625, "loss/jsd": 0.0, "loss/logits": 0.24587048590183258, "step": 3502 }, { "epoch": 0.219, "grad_norm": 2.9375, "grad_norm_var": 0.023270670572916666, "learning_rate": 0.0001, "loss": 8.2429, "loss/crossentropy": 2.5829883813858032, "loss/hidden": 3.2109375, "loss/jsd": 0.0, "loss/logits": 0.2657495513558388, "step": 3504 }, { "epoch": 0.219125, "grad_norm": 2.859375, "grad_norm_var": 0.023786417643229165, "learning_rate": 0.0001, "loss": 7.9588, "loss/crossentropy": 2.324189066886902, "loss/hidden": 3.1953125, "loss/jsd": 0.0, "loss/logits": 0.2597867324948311, "step": 3506 }, { "epoch": 0.21925, "grad_norm": 2.96875, "grad_norm_var": 0.02359619140625, "learning_rate": 0.0001, "loss": 8.0548, "loss/crossentropy": 2.2501312494277954, "loss/hidden": 3.1640625, "loss/jsd": 0.0, "loss/logits": 0.2503369599580765, "step": 3508 }, { "epoch": 0.219375, "grad_norm": 2.9375, "grad_norm_var": 0.0199371337890625, "learning_rate": 0.0001, "loss": 8.0606, "loss/crossentropy": 2.3427246809005737, "loss/hidden": 3.1484375, "loss/jsd": 0.0, "loss/logits": 0.25106480717658997, "step": 3510 }, { "epoch": 0.2195, "grad_norm": 3.078125, "grad_norm_var": 0.019391886393229165, "learning_rate": 0.0001, "loss": 8.0356, "loss/crossentropy": 2.483175754547119, "loss/hidden": 3.15625, "loss/jsd": 0.0, "loss/logits": 0.27612486481666565, "step": 3512 }, { "epoch": 0.219625, "grad_norm": 2.75, "grad_norm_var": 0.022614542643229166, "learning_rate": 0.0001, "loss": 8.1657, "loss/crossentropy": 2.44490385055542, "loss/hidden": 3.1640625, "loss/jsd": 0.0, "loss/logits": 0.27590498328208923, "step": 3514 }, { "epoch": 0.21975, "grad_norm": 2.921875, "grad_norm_var": 0.022005208333333335, "learning_rate": 0.0001, "loss": 8.068, "loss/crossentropy": 2.4559766054153442, "loss/hidden": 3.15625, "loss/jsd": 0.0, "loss/logits": 0.24495191872119904, "step": 3516 }, { "epoch": 0.219875, "grad_norm": 2.859375, "grad_norm_var": 0.014957682291666666, "learning_rate": 0.0001, "loss": 7.8848, "loss/crossentropy": 2.1902081966400146, "loss/hidden": 3.09375, "loss/jsd": 0.0, "loss/logits": 0.2486228197813034, "step": 3518 }, { "epoch": 0.22, "grad_norm": 2.8125, "grad_norm_var": 0.014253743489583333, "learning_rate": 0.0001, "loss": 8.0384, "loss/crossentropy": 2.4357227087020874, "loss/hidden": 3.1640625, "loss/jsd": 0.0, "loss/logits": 0.23928922414779663, "step": 3520 }, { "epoch": 0.220125, "grad_norm": 2.796875, "grad_norm_var": 0.010986328125, "learning_rate": 0.0001, "loss": 8.1229, "loss/crossentropy": 2.2222291231155396, "loss/hidden": 3.1328125, "loss/jsd": 0.0, "loss/logits": 0.27725693583488464, "step": 3522 }, { "epoch": 0.22025, "grad_norm": 3.046875, "grad_norm_var": 0.012116495768229167, "learning_rate": 0.0001, "loss": 7.9818, "loss/crossentropy": 2.5780692100524902, "loss/hidden": 3.1484375, "loss/jsd": 0.0, "loss/logits": 0.2518724948167801, "step": 3524 }, { "epoch": 0.220375, "grad_norm": 3.03125, "grad_norm_var": 0.014045206705729167, "learning_rate": 0.0001, "loss": 8.0047, "loss/crossentropy": 2.57025945186615, "loss/hidden": 3.1640625, "loss/jsd": 0.0, "loss/logits": 0.262205608189106, "step": 3526 }, { "epoch": 0.2205, "grad_norm": 2.765625, "grad_norm_var": 0.013719685872395833, "learning_rate": 0.0001, "loss": 7.8127, "loss/crossentropy": 2.07043993473053, "loss/hidden": 3.078125, "loss/jsd": 0.0, "loss/logits": 0.23150594532489777, "step": 3528 }, { "epoch": 0.220625, "grad_norm": 3.046875, "grad_norm_var": 0.01549072265625, "learning_rate": 0.0001, "loss": 8.2683, "loss/crossentropy": 2.4276788234710693, "loss/hidden": 3.171875, "loss/jsd": 0.0, "loss/logits": 0.2603815943002701, "step": 3530 }, { "epoch": 0.22075, "grad_norm": 2.78125, "grad_norm_var": 0.0149078369140625, "learning_rate": 0.0001, "loss": 8.0993, "loss/crossentropy": 2.1250449419021606, "loss/hidden": 3.125, "loss/jsd": 0.0, "loss/logits": 0.2265177145600319, "step": 3532 }, { "epoch": 0.220875, "grad_norm": 3.046875, "grad_norm_var": 0.01607666015625, "learning_rate": 0.0001, "loss": 8.0249, "loss/crossentropy": 2.3386144638061523, "loss/hidden": 3.1875, "loss/jsd": 0.0, "loss/logits": 0.274929404258728, "step": 3534 }, { "epoch": 0.221, "grad_norm": 3.078125, "grad_norm_var": 0.018488566080729168, "learning_rate": 0.0001, "loss": 8.2108, "loss/crossentropy": 2.406868577003479, "loss/hidden": 3.2265625, "loss/jsd": 0.0, "loss/logits": 0.2587934732437134, "step": 3536 }, { "epoch": 0.221125, "grad_norm": 2.859375, "grad_norm_var": 0.0168121337890625, "learning_rate": 0.0001, "loss": 8.0413, "loss/crossentropy": 2.4491130113601685, "loss/hidden": 3.140625, "loss/jsd": 0.0, "loss/logits": 0.23888002336025238, "step": 3538 }, { "epoch": 0.22125, "grad_norm": 2.6875, "grad_norm_var": 0.023893229166666665, "learning_rate": 0.0001, "loss": 8.123, "loss/crossentropy": 2.2878233194351196, "loss/hidden": 3.1640625, "loss/jsd": 0.0, "loss/logits": 0.265165239572525, "step": 3540 }, { "epoch": 0.221375, "grad_norm": 3.171875, "grad_norm_var": 0.028902180989583335, "learning_rate": 0.0001, "loss": 8.0388, "loss/crossentropy": 2.3898258209228516, "loss/hidden": 3.125, "loss/jsd": 0.0, "loss/logits": 0.2545909136533737, "step": 3542 }, { "epoch": 0.2215, "grad_norm": 2.765625, "grad_norm_var": 0.027213541666666667, "learning_rate": 0.0001, "loss": 8.0257, "loss/crossentropy": 2.4284157752990723, "loss/hidden": 3.2109375, "loss/jsd": 0.0, "loss/logits": 0.27379511296749115, "step": 3544 }, { "epoch": 0.221625, "grad_norm": 2.875, "grad_norm_var": 0.025031534830729167, "learning_rate": 0.0001, "loss": 7.9438, "loss/crossentropy": 2.2095457315444946, "loss/hidden": 3.1328125, "loss/jsd": 0.0, "loss/logits": 0.248259037733078, "step": 3546 }, { "epoch": 0.22175, "grad_norm": 2.953125, "grad_norm_var": 0.023758951822916666, "learning_rate": 0.0001, "loss": 7.9915, "loss/crossentropy": 2.4044177532196045, "loss/hidden": 3.1171875, "loss/jsd": 0.0, "loss/logits": 0.25886303186416626, "step": 3548 }, { "epoch": 0.221875, "grad_norm": 2.625, "grad_norm_var": 0.0283203125, "learning_rate": 0.0001, "loss": 7.9139, "loss/crossentropy": 2.2689582109451294, "loss/hidden": 3.15625, "loss/jsd": 0.0, "loss/logits": 0.24590185284614563, "step": 3550 }, { "epoch": 0.222, "grad_norm": 2.765625, "grad_norm_var": 0.0251373291015625, "learning_rate": 0.0001, "loss": 7.8249, "loss/crossentropy": 2.3793697357177734, "loss/hidden": 3.203125, "loss/jsd": 0.0, "loss/logits": 0.2494877427816391, "step": 3552 }, { "epoch": 0.222125, "grad_norm": 2.921875, "grad_norm_var": 0.0253570556640625, "learning_rate": 0.0001, "loss": 8.0842, "loss/crossentropy": 2.204411268234253, "loss/hidden": 3.0859375, "loss/jsd": 0.0, "loss/logits": 0.240115225315094, "step": 3554 }, { "epoch": 0.22225, "grad_norm": 2.953125, "grad_norm_var": 0.020970662434895832, "learning_rate": 0.0001, "loss": 8.0583, "loss/crossentropy": 2.307512044906616, "loss/hidden": 3.2109375, "loss/jsd": 0.0, "loss/logits": 0.23759452998638153, "step": 3556 }, { "epoch": 0.222375, "grad_norm": 2.828125, "grad_norm_var": 0.044611612955729164, "learning_rate": 0.0001, "loss": 7.8452, "loss/crossentropy": 2.4004819989204407, "loss/hidden": 3.3203125, "loss/jsd": 0.0, "loss/logits": 0.28139108419418335, "step": 3558 }, { "epoch": 0.2225, "grad_norm": 2.984375, "grad_norm_var": 0.0453125, "learning_rate": 0.0001, "loss": 7.9175, "loss/crossentropy": 2.321327328681946, "loss/hidden": 3.1328125, "loss/jsd": 0.0, "loss/logits": 0.2644403874874115, "step": 3560 }, { "epoch": 0.222625, "grad_norm": 2.8125, "grad_norm_var": 0.0475738525390625, "learning_rate": 0.0001, "loss": 7.9732, "loss/crossentropy": 2.495920181274414, "loss/hidden": 3.1796875, "loss/jsd": 0.0, "loss/logits": 0.2554667443037033, "step": 3562 }, { "epoch": 0.22275, "grad_norm": 3.3125, "grad_norm_var": 0.057291666666666664, "learning_rate": 0.0001, "loss": 8.1036, "loss/crossentropy": 2.382394790649414, "loss/hidden": 3.15625, "loss/jsd": 0.0, "loss/logits": 0.26405252516269684, "step": 3564 }, { "epoch": 0.222875, "grad_norm": 3.859375, "grad_norm_var": 0.11291910807291666, "learning_rate": 0.0001, "loss": 8.3047, "loss/crossentropy": 2.2396143674850464, "loss/hidden": 3.1484375, "loss/jsd": 0.0, "loss/logits": 0.2523963898420334, "step": 3566 }, { "epoch": 0.223, "grad_norm": 2.875, "grad_norm_var": 0.09706624348958333, "learning_rate": 0.0001, "loss": 8.1758, "loss/crossentropy": 2.323571801185608, "loss/hidden": 3.1640625, "loss/jsd": 0.0, "loss/logits": 0.24818182736635208, "step": 3568 }, { "epoch": 0.223125, "grad_norm": 2.96875, "grad_norm_var": 0.09746805826822917, "learning_rate": 0.0001, "loss": 8.2098, "loss/crossentropy": 2.2471351623535156, "loss/hidden": 3.1484375, "loss/jsd": 0.0, "loss/logits": 0.2540801614522934, "step": 3570 }, { "epoch": 0.22325, "grad_norm": 2.84375, "grad_norm_var": 0.1020904541015625, "learning_rate": 0.0001, "loss": 8.1846, "loss/crossentropy": 2.622172474861145, "loss/hidden": 3.140625, "loss/jsd": 0.0, "loss/logits": 0.2584248036146164, "step": 3572 }, { "epoch": 0.223375, "grad_norm": 2.75, "grad_norm_var": 0.09426167805989584, "learning_rate": 0.0001, "loss": 8.0865, "loss/crossentropy": 2.5120197534561157, "loss/hidden": 3.1640625, "loss/jsd": 0.0, "loss/logits": 0.25580984354019165, "step": 3574 }, { "epoch": 0.2235, "grad_norm": 2.75, "grad_norm_var": 0.09970296223958333, "learning_rate": 0.0001, "loss": 8.0353, "loss/crossentropy": 2.3647409677505493, "loss/hidden": 3.1796875, "loss/jsd": 0.0, "loss/logits": 0.2629951015114784, "step": 3576 }, { "epoch": 0.223625, "grad_norm": 2.9375, "grad_norm_var": 0.0968170166015625, "learning_rate": 0.0001, "loss": 8.0964, "loss/crossentropy": 2.225765824317932, "loss/hidden": 3.140625, "loss/jsd": 0.0, "loss/logits": 0.27005481719970703, "step": 3578 }, { "epoch": 0.22375, "grad_norm": 2.78125, "grad_norm_var": 0.0942047119140625, "learning_rate": 0.0001, "loss": 7.9496, "loss/crossentropy": 2.172963261604309, "loss/hidden": 3.1796875, "loss/jsd": 0.0, "loss/logits": 0.2738679349422455, "step": 3580 }, { "epoch": 0.223875, "grad_norm": 3.0, "grad_norm_var": 0.012776692708333334, "learning_rate": 0.0001, "loss": 8.1431, "loss/crossentropy": 2.360185146331787, "loss/hidden": 3.1171875, "loss/jsd": 0.0, "loss/logits": 0.2576560080051422, "step": 3582 }, { "epoch": 0.224, "grad_norm": 3.03125, "grad_norm_var": 0.010477701822916666, "learning_rate": 0.0001, "loss": 7.9726, "loss/crossentropy": 2.034193217754364, "loss/hidden": 3.1328125, "loss/jsd": 0.0, "loss/logits": 0.24234963953495026, "step": 3584 }, { "epoch": 0.224125, "grad_norm": 2.890625, "grad_norm_var": 0.010640462239583334, "learning_rate": 0.0001, "loss": 7.6952, "loss/crossentropy": 2.266782283782959, "loss/hidden": 3.109375, "loss/jsd": 0.0, "loss/logits": 0.24584876000881195, "step": 3586 }, { "epoch": 0.22425, "grad_norm": 3.515625, "grad_norm_var": 1.381476847330729, "learning_rate": 0.0001, "loss": 8.4263, "loss/crossentropy": 2.3165758848190308, "loss/hidden": 3.125, "loss/jsd": 0.0, "loss/logits": 0.23391347378492355, "step": 3588 }, { "epoch": 0.224375, "grad_norm": 3.4375, "grad_norm_var": 1.3608876546223958, "learning_rate": 0.0001, "loss": 8.1828, "loss/crossentropy": 2.5059953927993774, "loss/hidden": 3.1640625, "loss/jsd": 0.0, "loss/logits": 0.28126345574855804, "step": 3590 }, { "epoch": 0.2245, "grad_norm": 3.03125, "grad_norm_var": 1.338703409830729, "learning_rate": 0.0001, "loss": 8.1214, "loss/crossentropy": 2.365064263343811, "loss/hidden": 3.1875, "loss/jsd": 0.0, "loss/logits": 0.27421511709690094, "step": 3592 }, { "epoch": 0.224625, "grad_norm": 2.90625, "grad_norm_var": 1.3280019124348958, "learning_rate": 0.0001, "loss": 8.0988, "loss/crossentropy": 2.4384506940841675, "loss/hidden": 3.1875, "loss/jsd": 0.0, "loss/logits": 0.2522090822458267, "step": 3594 }, { "epoch": 0.22475, "grad_norm": 2.96875, "grad_norm_var": 1.304955037434896, "learning_rate": 0.0001, "loss": 8.1116, "loss/crossentropy": 2.1527076959609985, "loss/hidden": 3.1640625, "loss/jsd": 0.0, "loss/logits": 0.24413420259952545, "step": 3596 }, { "epoch": 0.224875, "grad_norm": 3.140625, "grad_norm_var": 1.301513671875, "learning_rate": 0.0001, "loss": 8.1075, "loss/crossentropy": 2.2478511333465576, "loss/hidden": 3.1328125, "loss/jsd": 0.0, "loss/logits": 0.2579783499240875, "step": 3598 }, { "epoch": 0.225, "grad_norm": 3.046875, "grad_norm_var": 1.307331339518229, "learning_rate": 0.0001, "loss": 7.9146, "loss/crossentropy": 2.396006226539612, "loss/hidden": 3.1875, "loss/jsd": 0.0, "loss/logits": 0.24864860624074936, "step": 3600 }, { "epoch": 0.225125, "grad_norm": 2.875, "grad_norm_var": 1.31002197265625, "learning_rate": 0.0001, "loss": 8.2544, "loss/crossentropy": 2.6450746059417725, "loss/hidden": 3.1796875, "loss/jsd": 0.0, "loss/logits": 0.2689923644065857, "step": 3602 }, { "epoch": 0.22525, "grad_norm": 2.75, "grad_norm_var": 0.05621744791666667, "learning_rate": 0.0001, "loss": 7.7334, "loss/crossentropy": 2.1939942836761475, "loss/hidden": 3.125, "loss/jsd": 0.0, "loss/logits": 0.23875074833631516, "step": 3604 }, { "epoch": 0.225375, "grad_norm": 2.78125, "grad_norm_var": 0.013114420572916667, "learning_rate": 0.0001, "loss": 7.9962, "loss/crossentropy": 2.20789635181427, "loss/hidden": 3.1328125, "loss/jsd": 0.0, "loss/logits": 0.25146640837192535, "step": 3606 }, { "epoch": 0.2255, "grad_norm": 2.953125, "grad_norm_var": 0.011832682291666667, "learning_rate": 0.0001, "loss": 8.0575, "loss/crossentropy": 2.2489298582077026, "loss/hidden": 3.15625, "loss/jsd": 0.0, "loss/logits": 0.24663664400577545, "step": 3608 }, { "epoch": 0.225625, "grad_norm": 3.078125, "grad_norm_var": 0.0134674072265625, "learning_rate": 0.0001, "loss": 8.0739, "loss/crossentropy": 2.153047263622284, "loss/hidden": 3.109375, "loss/jsd": 0.0, "loss/logits": 0.25803135335445404, "step": 3610 }, { "epoch": 0.22575, "grad_norm": 3.0625, "grad_norm_var": 0.0144683837890625, "learning_rate": 0.0001, "loss": 8.231, "loss/crossentropy": 2.17412793636322, "loss/hidden": 3.1171875, "loss/jsd": 0.0, "loss/logits": 0.2347177267074585, "step": 3612 }, { "epoch": 0.225875, "grad_norm": 2.96875, "grad_norm_var": 0.011263020833333333, "learning_rate": 0.0001, "loss": 8.0072, "loss/crossentropy": 2.55859112739563, "loss/hidden": 3.1640625, "loss/jsd": 0.0, "loss/logits": 0.25972864031791687, "step": 3614 }, { "epoch": 0.226, "grad_norm": 2.796875, "grad_norm_var": 0.011714680989583334, "learning_rate": 0.0001, "loss": 8.0068, "loss/crossentropy": 2.537785530090332, "loss/hidden": 3.1328125, "loss/jsd": 0.0, "loss/logits": 0.2541259229183197, "step": 3616 }, { "epoch": 0.226125, "grad_norm": 2.953125, "grad_norm_var": 0.0119140625, "learning_rate": 0.0001, "loss": 7.9701, "loss/crossentropy": 2.1381853818893433, "loss/hidden": 3.171875, "loss/jsd": 0.0, "loss/logits": 0.2583460807800293, "step": 3618 }, { "epoch": 0.22625, "grad_norm": 2.859375, "grad_norm_var": 0.010383097330729167, "learning_rate": 0.0001, "loss": 8.0889, "loss/crossentropy": 2.267898917198181, "loss/hidden": 3.2421875, "loss/jsd": 0.0, "loss/logits": 0.26800353825092316, "step": 3620 }, { "epoch": 0.226375, "grad_norm": 2.921875, "grad_norm_var": 0.009147135416666667, "learning_rate": 0.0001, "loss": 8.0278, "loss/crossentropy": 2.291195571422577, "loss/hidden": 3.2265625, "loss/jsd": 0.0, "loss/logits": 0.26078518480062485, "step": 3622 }, { "epoch": 0.2265, "grad_norm": 2.890625, "grad_norm_var": 0.0090484619140625, "learning_rate": 0.0001, "loss": 7.7674, "loss/crossentropy": 2.1368112564086914, "loss/hidden": 3.140625, "loss/jsd": 0.0, "loss/logits": 0.23087257146835327, "step": 3624 }, { "epoch": 0.226625, "grad_norm": 2.9375, "grad_norm_var": 0.0071685791015625, "learning_rate": 0.0001, "loss": 8.2532, "loss/crossentropy": 2.495459794998169, "loss/hidden": 3.1328125, "loss/jsd": 0.0, "loss/logits": 0.2500630244612694, "step": 3626 }, { "epoch": 0.22675, "grad_norm": 2.875, "grad_norm_var": 0.0052154541015625, "learning_rate": 0.0001, "loss": 7.9174, "loss/crossentropy": 2.3807636499404907, "loss/hidden": 3.1484375, "loss/jsd": 0.0, "loss/logits": 0.2521054297685623, "step": 3628 }, { "epoch": 0.226875, "grad_norm": 2.984375, "grad_norm_var": 0.0088043212890625, "learning_rate": 0.0001, "loss": 8.0956, "loss/crossentropy": 2.2587956190109253, "loss/hidden": 3.109375, "loss/jsd": 0.0, "loss/logits": 0.2414519563317299, "step": 3630 }, { "epoch": 0.227, "grad_norm": 2.859375, "grad_norm_var": 0.009357706705729166, "learning_rate": 0.0001, "loss": 8.0097, "loss/crossentropy": 2.202399969100952, "loss/hidden": 3.1796875, "loss/jsd": 0.0, "loss/logits": 0.2805350720882416, "step": 3632 }, { "epoch": 0.227125, "grad_norm": 2.90625, "grad_norm_var": 0.00894775390625, "learning_rate": 0.0001, "loss": 7.9928, "loss/crossentropy": 2.351557970046997, "loss/hidden": 3.203125, "loss/jsd": 0.0, "loss/logits": 0.2543243542313576, "step": 3634 }, { "epoch": 0.22725, "grad_norm": 2.96875, "grad_norm_var": 0.008226521809895833, "learning_rate": 0.0001, "loss": 8.1574, "loss/crossentropy": 2.3034415245056152, "loss/hidden": 3.21875, "loss/jsd": 0.0, "loss/logits": 0.28570613265037537, "step": 3636 }, { "epoch": 0.227375, "grad_norm": 3.0625, "grad_norm_var": 0.009593709309895834, "learning_rate": 0.0001, "loss": 7.9509, "loss/crossentropy": 2.2518638372421265, "loss/hidden": 3.15625, "loss/jsd": 0.0, "loss/logits": 0.2769414782524109, "step": 3638 }, { "epoch": 0.2275, "grad_norm": 3.0625, "grad_norm_var": 0.0104156494140625, "learning_rate": 0.0001, "loss": 8.2742, "loss/crossentropy": 2.313628911972046, "loss/hidden": 3.1484375, "loss/jsd": 0.0, "loss/logits": 0.2677566707134247, "step": 3640 }, { "epoch": 0.227625, "grad_norm": 2.8125, "grad_norm_var": 0.022614542643229166, "learning_rate": 0.0001, "loss": 8.0699, "loss/crossentropy": 2.4361445903778076, "loss/hidden": 3.171875, "loss/jsd": 0.0, "loss/logits": 0.2696637511253357, "step": 3642 }, { "epoch": 0.22775, "grad_norm": 2.65625, "grad_norm_var": 0.0240234375, "learning_rate": 0.0001, "loss": 7.9745, "loss/crossentropy": 2.3829265832901, "loss/hidden": 3.1484375, "loss/jsd": 0.0, "loss/logits": 0.24858426302671432, "step": 3644 }, { "epoch": 0.227875, "grad_norm": 2.734375, "grad_norm_var": 0.0252838134765625, "learning_rate": 0.0001, "loss": 8.0501, "loss/crossentropy": 2.344617486000061, "loss/hidden": 3.1640625, "loss/jsd": 0.0, "loss/logits": 0.2515474408864975, "step": 3646 }, { "epoch": 0.228, "grad_norm": 3.140625, "grad_norm_var": 0.02886962890625, "learning_rate": 0.0001, "loss": 8.1793, "loss/crossentropy": 2.376260995864868, "loss/hidden": 3.15625, "loss/jsd": 0.0, "loss/logits": 0.24280902743339539, "step": 3648 }, { "epoch": 0.228125, "grad_norm": 3.03125, "grad_norm_var": 0.03185221354166667, "learning_rate": 0.0001, "loss": 8.2511, "loss/crossentropy": 2.427824020385742, "loss/hidden": 3.1875, "loss/jsd": 0.0, "loss/logits": 0.27128610014915466, "step": 3650 }, { "epoch": 0.22825, "grad_norm": 3.109375, "grad_norm_var": 0.04291890462239583, "learning_rate": 0.0001, "loss": 8.229, "loss/crossentropy": 2.465716004371643, "loss/hidden": 3.1640625, "loss/jsd": 0.0, "loss/logits": 0.2654089778661728, "step": 3652 }, { "epoch": 0.228375, "grad_norm": 2.609375, "grad_norm_var": 0.055231730143229164, "learning_rate": 0.0001, "loss": 7.7795, "loss/crossentropy": 2.081652522087097, "loss/hidden": 3.140625, "loss/jsd": 0.0, "loss/logits": 0.25261829793453217, "step": 3654 }, { "epoch": 0.2285, "grad_norm": 3.09375, "grad_norm_var": 0.05577799479166667, "learning_rate": 0.0001, "loss": 7.9502, "loss/crossentropy": 2.1460715532302856, "loss/hidden": 3.1640625, "loss/jsd": 0.0, "loss/logits": 0.24947232007980347, "step": 3656 }, { "epoch": 0.228625, "grad_norm": 3.046875, "grad_norm_var": 0.047053019205729164, "learning_rate": 0.0001, "loss": 8.3344, "loss/crossentropy": 2.341508150100708, "loss/hidden": 3.21875, "loss/jsd": 0.0, "loss/logits": 0.2834539860486984, "step": 3658 }, { "epoch": 0.22875, "grad_norm": 3.234375, "grad_norm_var": 0.0469635009765625, "learning_rate": 0.0001, "loss": 8.0013, "loss/crossentropy": 2.135187327861786, "loss/hidden": 3.1953125, "loss/jsd": 0.0, "loss/logits": 0.2561069577932358, "step": 3660 }, { "epoch": 0.228875, "grad_norm": 3.34375, "grad_norm_var": 0.30520426432291664, "learning_rate": 0.0001, "loss": 8.2324, "loss/crossentropy": 2.383564591407776, "loss/hidden": 3.1796875, "loss/jsd": 0.0, "loss/logits": 0.28082774579524994, "step": 3662 }, { "epoch": 0.229, "grad_norm": 2.71875, "grad_norm_var": 0.3198313395182292, "learning_rate": 0.0001, "loss": 8.081, "loss/crossentropy": 2.199310064315796, "loss/hidden": 3.21875, "loss/jsd": 0.0, "loss/logits": 0.2764912396669388, "step": 3664 }, { "epoch": 0.229125, "grad_norm": 3.078125, "grad_norm_var": 0.3196614583333333, "learning_rate": 0.0001, "loss": 8.0434, "loss/crossentropy": 2.3052173852920532, "loss/hidden": 3.203125, "loss/jsd": 0.0, "loss/logits": 0.28200992941856384, "step": 3666 }, { "epoch": 0.22925, "grad_norm": 2.953125, "grad_norm_var": 0.32179361979166665, "learning_rate": 0.0001, "loss": 8.0128, "loss/crossentropy": 2.3882672786712646, "loss/hidden": 3.15625, "loss/jsd": 0.0, "loss/logits": 0.2689662426710129, "step": 3668 }, { "epoch": 0.229375, "grad_norm": 3.234375, "grad_norm_var": 0.3091949462890625, "learning_rate": 0.0001, "loss": 7.9856, "loss/crossentropy": 2.4546321630477905, "loss/hidden": 3.28125, "loss/jsd": 0.0, "loss/logits": 0.25415413081645966, "step": 3670 }, { "epoch": 0.2295, "grad_norm": 2.84375, "grad_norm_var": 0.3122029622395833, "learning_rate": 0.0001, "loss": 8.0347, "loss/crossentropy": 2.3109859228134155, "loss/hidden": 3.1875, "loss/jsd": 0.0, "loss/logits": 0.2632336914539337, "step": 3672 }, { "epoch": 0.229625, "grad_norm": 2.921875, "grad_norm_var": 0.31685791015625, "learning_rate": 0.0001, "loss": 7.9744, "loss/crossentropy": 2.24290668964386, "loss/hidden": 3.15625, "loss/jsd": 0.0, "loss/logits": 0.24445384740829468, "step": 3674 }, { "epoch": 0.22975, "grad_norm": 2.953125, "grad_norm_var": 0.3194976806640625, "learning_rate": 0.0001, "loss": 8.0582, "loss/crossentropy": 2.6178494691848755, "loss/hidden": 3.1953125, "loss/jsd": 0.0, "loss/logits": 0.2564311772584915, "step": 3676 }, { "epoch": 0.229875, "grad_norm": 3.109375, "grad_norm_var": 0.02584228515625, "learning_rate": 0.0001, "loss": 8.1775, "loss/crossentropy": 2.266401767730713, "loss/hidden": 3.1875, "loss/jsd": 0.0, "loss/logits": 0.26438479125499725, "step": 3678 }, { "epoch": 0.23, "grad_norm": 3.21875, "grad_norm_var": 0.0263580322265625, "learning_rate": 0.0001, "loss": 7.9564, "loss/crossentropy": 2.2778828144073486, "loss/hidden": 3.1328125, "loss/jsd": 0.0, "loss/logits": 0.249868243932724, "step": 3680 }, { "epoch": 0.230125, "grad_norm": 2.96875, "grad_norm_var": 0.03190816243489583, "learning_rate": 0.0001, "loss": 8.2025, "loss/crossentropy": 2.5149052143096924, "loss/hidden": 3.1953125, "loss/jsd": 0.0, "loss/logits": 0.2597532272338867, "step": 3682 }, { "epoch": 0.23025, "grad_norm": 2.796875, "grad_norm_var": 0.034407552083333334, "learning_rate": 0.0001, "loss": 8.1936, "loss/crossentropy": 2.4017945528030396, "loss/hidden": 3.171875, "loss/jsd": 0.0, "loss/logits": 0.2589564323425293, "step": 3684 }, { "epoch": 0.230375, "grad_norm": 2.90625, "grad_norm_var": 0.024933878580729166, "learning_rate": 0.0001, "loss": 8.0723, "loss/crossentropy": 2.3127940893173218, "loss/hidden": 3.171875, "loss/jsd": 0.0, "loss/logits": 0.2643566429615021, "step": 3686 }, { "epoch": 0.2305, "grad_norm": 2.640625, "grad_norm_var": 0.032063802083333336, "learning_rate": 0.0001, "loss": 7.6895, "loss/crossentropy": 2.009396731853485, "loss/hidden": 3.078125, "loss/jsd": 0.0, "loss/logits": 0.2169792503118515, "step": 3688 }, { "epoch": 0.230625, "grad_norm": 2.90625, "grad_norm_var": 0.03504130045572917, "learning_rate": 0.0001, "loss": 8.242, "loss/crossentropy": 2.3510366678237915, "loss/hidden": 3.1484375, "loss/jsd": 0.0, "loss/logits": 0.2538380026817322, "step": 3690 }, { "epoch": 0.23075, "grad_norm": 2.84375, "grad_norm_var": 0.030973307291666665, "learning_rate": 0.0001, "loss": 7.9667, "loss/crossentropy": 2.3450275659561157, "loss/hidden": 3.1171875, "loss/jsd": 0.0, "loss/logits": 0.26012004911899567, "step": 3692 }, { "epoch": 0.230875, "grad_norm": 2.75, "grad_norm_var": 0.03232014973958333, "learning_rate": 0.0001, "loss": 8.0063, "loss/crossentropy": 2.0265676975250244, "loss/hidden": 3.125, "loss/jsd": 0.0, "loss/logits": 0.2440163493156433, "step": 3694 }, { "epoch": 0.231, "grad_norm": 3.25, "grad_norm_var": 0.041259765625, "learning_rate": 0.0001, "loss": 8.152, "loss/crossentropy": 2.259366512298584, "loss/hidden": 3.1640625, "loss/jsd": 0.0, "loss/logits": 0.2548896074295044, "step": 3696 }, { "epoch": 0.231125, "grad_norm": 3.21875, "grad_norm_var": 0.037984212239583336, "learning_rate": 0.0001, "loss": 8.177, "loss/crossentropy": 2.4230661392211914, "loss/hidden": 3.1171875, "loss/jsd": 0.0, "loss/logits": 0.25759340822696686, "step": 3698 }, { "epoch": 0.23125, "grad_norm": 2.828125, "grad_norm_var": 0.03795572916666667, "learning_rate": 0.0001, "loss": 8.1042, "loss/crossentropy": 2.4304851293563843, "loss/hidden": 3.171875, "loss/jsd": 0.0, "loss/logits": 0.25671106576919556, "step": 3700 }, { "epoch": 0.231375, "grad_norm": 3.109375, "grad_norm_var": 0.042756144205729166, "learning_rate": 0.0001, "loss": 8.1151, "loss/crossentropy": 2.601144790649414, "loss/hidden": 3.15625, "loss/jsd": 0.0, "loss/logits": 0.25495633482933044, "step": 3702 }, { "epoch": 0.2315, "grad_norm": 2.84375, "grad_norm_var": 0.03870035807291667, "learning_rate": 0.0001, "loss": 7.8518, "loss/crossentropy": 2.0991747975349426, "loss/hidden": 3.140625, "loss/jsd": 0.0, "loss/logits": 0.23651761561632156, "step": 3704 }, { "epoch": 0.231625, "grad_norm": 3.0, "grad_norm_var": 0.03697916666666667, "learning_rate": 0.0001, "loss": 8.0609, "loss/crossentropy": 2.3886858224868774, "loss/hidden": 3.203125, "loss/jsd": 0.0, "loss/logits": 0.249774731695652, "step": 3706 }, { "epoch": 0.23175, "grad_norm": 2.875, "grad_norm_var": 0.03943684895833333, "learning_rate": 0.0001, "loss": 8.0872, "loss/crossentropy": 2.275684356689453, "loss/hidden": 3.15625, "loss/jsd": 0.0, "loss/logits": 0.2471064031124115, "step": 3708 }, { "epoch": 0.231875, "grad_norm": 2.84375, "grad_norm_var": 0.03299153645833333, "learning_rate": 0.0001, "loss": 7.9469, "loss/crossentropy": 2.490668535232544, "loss/hidden": 3.171875, "loss/jsd": 0.0, "loss/logits": 0.26126355677843094, "step": 3710 }, { "epoch": 0.232, "grad_norm": 2.734375, "grad_norm_var": 0.029857381184895834, "learning_rate": 0.0001, "loss": 7.9749, "loss/crossentropy": 2.5393855571746826, "loss/hidden": 3.15625, "loss/jsd": 0.0, "loss/logits": 0.24972229450941086, "step": 3712 }, { "epoch": 0.232125, "grad_norm": 2.828125, "grad_norm_var": 0.028229777018229166, "learning_rate": 0.0001, "loss": 8.0115, "loss/crossentropy": 2.453523635864258, "loss/hidden": 3.140625, "loss/jsd": 0.0, "loss/logits": 0.26304805278778076, "step": 3714 }, { "epoch": 0.23225, "grad_norm": 2.84375, "grad_norm_var": 0.028075154622395834, "learning_rate": 0.0001, "loss": 8.1984, "loss/crossentropy": 2.2866803407669067, "loss/hidden": 3.1484375, "loss/jsd": 0.0, "loss/logits": 0.26558496057987213, "step": 3716 }, { "epoch": 0.232375, "grad_norm": 2.9375, "grad_norm_var": 0.022907511393229166, "learning_rate": 0.0001, "loss": 8.1454, "loss/crossentropy": 2.5315120220184326, "loss/hidden": 3.1640625, "loss/jsd": 0.0, "loss/logits": 0.2710745185613632, "step": 3718 }, { "epoch": 0.2325, "grad_norm": 2.796875, "grad_norm_var": 0.020563761393229168, "learning_rate": 0.0001, "loss": 7.7971, "loss/crossentropy": 2.1747653484344482, "loss/hidden": 3.1015625, "loss/jsd": 0.0, "loss/logits": 0.2295996993780136, "step": 3720 }, { "epoch": 0.232625, "grad_norm": 3.28125, "grad_norm_var": 0.026634724934895833, "learning_rate": 0.0001, "loss": 7.9727, "loss/crossentropy": 2.348886013031006, "loss/hidden": 3.1640625, "loss/jsd": 0.0, "loss/logits": 0.2646675556898117, "step": 3722 }, { "epoch": 0.23275, "grad_norm": 2.5625, "grad_norm_var": 0.026871744791666666, "learning_rate": 0.0001, "loss": 7.5556, "loss/crossentropy": 1.9647228121757507, "loss/hidden": 3.09375, "loss/jsd": 0.0, "loss/logits": 0.22373639792203903, "step": 3724 }, { "epoch": 0.232875, "grad_norm": 2.78125, "grad_norm_var": 0.024762980143229165, "learning_rate": 0.0001, "loss": 7.9863, "loss/crossentropy": 2.328511357307434, "loss/hidden": 3.109375, "loss/jsd": 0.0, "loss/logits": 0.23512594401836395, "step": 3726 }, { "epoch": 0.233, "grad_norm": 2.859375, "grad_norm_var": 0.02340087890625, "learning_rate": 0.0001, "loss": 8.0379, "loss/crossentropy": 2.4145067930221558, "loss/hidden": 3.109375, "loss/jsd": 0.0, "loss/logits": 0.2505330890417099, "step": 3728 }, { "epoch": 0.233125, "grad_norm": 3.015625, "grad_norm_var": 0.024365234375, "learning_rate": 0.0001, "loss": 8.0389, "loss/crossentropy": 2.326350212097168, "loss/hidden": 3.1953125, "loss/jsd": 0.0, "loss/logits": 0.282063826918602, "step": 3730 }, { "epoch": 0.23325, "grad_norm": 3.046875, "grad_norm_var": 0.025047810872395833, "learning_rate": 0.0001, "loss": 8.0287, "loss/crossentropy": 2.3160377740859985, "loss/hidden": 3.171875, "loss/jsd": 0.0, "loss/logits": 0.25916415452957153, "step": 3732 }, { "epoch": 0.233375, "grad_norm": 2.78125, "grad_norm_var": 0.025666300455729166, "learning_rate": 0.0001, "loss": 8.0978, "loss/crossentropy": 2.5563199520111084, "loss/hidden": 3.2265625, "loss/jsd": 0.0, "loss/logits": 0.2803049683570862, "step": 3734 }, { "epoch": 0.2335, "grad_norm": 3.0625, "grad_norm_var": 0.02427978515625, "learning_rate": 0.0001, "loss": 7.9286, "loss/crossentropy": 2.1742677688598633, "loss/hidden": 3.140625, "loss/jsd": 0.0, "loss/logits": 0.25293681770563126, "step": 3736 }, { "epoch": 0.233625, "grad_norm": 2.890625, "grad_norm_var": 0.013776652018229167, "learning_rate": 0.0001, "loss": 8.1056, "loss/crossentropy": 2.3429336547851562, "loss/hidden": 3.203125, "loss/jsd": 0.0, "loss/logits": 0.2538968622684479, "step": 3738 }, { "epoch": 0.23375, "grad_norm": 2.84375, "grad_norm_var": 0.007111612955729167, "learning_rate": 0.0001, "loss": 8.1074, "loss/crossentropy": 2.31491482257843, "loss/hidden": 3.125, "loss/jsd": 0.0, "loss/logits": 0.24610111862421036, "step": 3740 }, { "epoch": 0.233875, "grad_norm": 2.84375, "grad_norm_var": 0.007233683268229167, "learning_rate": 0.0001, "loss": 7.8913, "loss/crossentropy": 2.399028778076172, "loss/hidden": 3.171875, "loss/jsd": 0.0, "loss/logits": 0.24484336376190186, "step": 3742 }, { "epoch": 0.234, "grad_norm": 2.8125, "grad_norm_var": 0.009208170572916667, "learning_rate": 0.0001, "loss": 7.8039, "loss/crossentropy": 2.3483502864837646, "loss/hidden": 3.15625, "loss/jsd": 0.0, "loss/logits": 0.24819336086511612, "step": 3744 }, { "epoch": 0.234125, "grad_norm": 2.796875, "grad_norm_var": 0.0086578369140625, "learning_rate": 0.0001, "loss": 7.8538, "loss/crossentropy": 2.27790105342865, "loss/hidden": 3.1171875, "loss/jsd": 0.0, "loss/logits": 0.24142986536026, "step": 3746 }, { "epoch": 0.23425, "grad_norm": 2.671875, "grad_norm_var": 0.008722941080729166, "learning_rate": 0.0001, "loss": 7.8606, "loss/crossentropy": 2.0212570428848267, "loss/hidden": 3.0859375, "loss/jsd": 0.0, "loss/logits": 0.23354055732488632, "step": 3748 }, { "epoch": 0.234375, "grad_norm": 2.921875, "grad_norm_var": 0.008771769205729167, "learning_rate": 0.0001, "loss": 7.9268, "loss/crossentropy": 2.212460517883301, "loss/hidden": 3.25, "loss/jsd": 0.0, "loss/logits": 0.2667820304632187, "step": 3750 }, { "epoch": 0.2345, "grad_norm": 2.796875, "grad_norm_var": 0.0054107666015625, "learning_rate": 0.0001, "loss": 7.8209, "loss/crossentropy": 2.054919123649597, "loss/hidden": 3.125, "loss/jsd": 0.0, "loss/logits": 0.2387581467628479, "step": 3752 }, { "epoch": 0.234625, "grad_norm": 2.90625, "grad_norm_var": 0.0064117431640625, "learning_rate": 0.0001, "loss": 8.0238, "loss/crossentropy": 2.1775506734848022, "loss/hidden": 3.125, "loss/jsd": 0.0, "loss/logits": 0.2639275938272476, "step": 3754 }, { "epoch": 0.23475, "grad_norm": 2.8125, "grad_norm_var": 0.007013956705729167, "learning_rate": 0.0001, "loss": 7.9167, "loss/crossentropy": 2.1846723556518555, "loss/hidden": 3.1484375, "loss/jsd": 0.0, "loss/logits": 0.23828266561031342, "step": 3756 }, { "epoch": 0.234875, "grad_norm": 3.046875, "grad_norm_var": 0.010286458333333333, "learning_rate": 0.0001, "loss": 7.8777, "loss/crossentropy": 2.134470820426941, "loss/hidden": 3.09375, "loss/jsd": 0.0, "loss/logits": 0.22749686986207962, "step": 3758 }, { "epoch": 0.235, "grad_norm": 2.953125, "grad_norm_var": 0.01025390625, "learning_rate": 0.0001, "loss": 8.009, "loss/crossentropy": 2.284530997276306, "loss/hidden": 3.1796875, "loss/jsd": 0.0, "loss/logits": 0.25272058695554733, "step": 3760 }, { "epoch": 0.235125, "grad_norm": 2.671875, "grad_norm_var": 0.01494140625, "learning_rate": 0.0001, "loss": 7.8373, "loss/crossentropy": 2.475208878517151, "loss/hidden": 3.1640625, "loss/jsd": 0.0, "loss/logits": 0.2501443922519684, "step": 3762 }, { "epoch": 0.23525, "grad_norm": 2.953125, "grad_norm_var": 0.013102213541666666, "learning_rate": 0.0001, "loss": 7.9143, "loss/crossentropy": 2.037322998046875, "loss/hidden": 3.078125, "loss/jsd": 0.0, "loss/logits": 0.23644433170557022, "step": 3764 }, { "epoch": 0.235375, "grad_norm": 2.953125, "grad_norm_var": 0.013841756184895833, "learning_rate": 0.0001, "loss": 8.0747, "loss/crossentropy": 2.3539260625839233, "loss/hidden": 3.125, "loss/jsd": 0.0, "loss/logits": 0.24543513357639313, "step": 3766 }, { "epoch": 0.2355, "grad_norm": 3.078125, "grad_norm_var": 0.014281209309895833, "learning_rate": 0.0001, "loss": 8.101, "loss/crossentropy": 2.3604376316070557, "loss/hidden": 3.125, "loss/jsd": 0.0, "loss/logits": 0.2443503588438034, "step": 3768 }, { "epoch": 0.235625, "grad_norm": 2.71875, "grad_norm_var": 0.023758951822916666, "learning_rate": 0.0001, "loss": 8.0751, "loss/crossentropy": 2.261757254600525, "loss/hidden": 3.1875, "loss/jsd": 0.0, "loss/logits": 0.2623937949538231, "step": 3770 }, { "epoch": 0.23575, "grad_norm": 2.703125, "grad_norm_var": 0.023714192708333335, "learning_rate": 0.0001, "loss": 7.7161, "loss/crossentropy": 2.4158157110214233, "loss/hidden": 3.1171875, "loss/jsd": 0.0, "loss/logits": 0.24528972804546356, "step": 3772 }, { "epoch": 0.235875, "grad_norm": 2.75, "grad_norm_var": 0.023844401041666668, "learning_rate": 0.0001, "loss": 7.9815, "loss/crossentropy": 2.449522614479065, "loss/hidden": 3.140625, "loss/jsd": 0.0, "loss/logits": 0.25976794958114624, "step": 3774 }, { "epoch": 0.236, "grad_norm": 2.765625, "grad_norm_var": 0.025093587239583333, "learning_rate": 0.0001, "loss": 8.0425, "loss/crossentropy": 2.3642194271087646, "loss/hidden": 3.1953125, "loss/jsd": 0.0, "loss/logits": 0.27245795726776123, "step": 3776 }, { "epoch": 0.236125, "grad_norm": 2.765625, "grad_norm_var": 0.023729451497395835, "learning_rate": 0.0001, "loss": 7.9755, "loss/crossentropy": 2.4502243995666504, "loss/hidden": 3.171875, "loss/jsd": 0.0, "loss/logits": 0.2875143885612488, "step": 3778 }, { "epoch": 0.23625, "grad_norm": 2.9375, "grad_norm_var": 0.026558430989583333, "learning_rate": 0.0001, "loss": 7.8647, "loss/crossentropy": 2.023205041885376, "loss/hidden": 3.109375, "loss/jsd": 0.0, "loss/logits": 0.2414599061012268, "step": 3780 }, { "epoch": 0.236375, "grad_norm": 2.84375, "grad_norm_var": 0.025406901041666666, "learning_rate": 0.0001, "loss": 7.9166, "loss/crossentropy": 2.1861640214920044, "loss/hidden": 3.140625, "loss/jsd": 0.0, "loss/logits": 0.24028976261615753, "step": 3782 }, { "epoch": 0.2365, "grad_norm": 2.625, "grad_norm_var": 0.023628743489583333, "learning_rate": 0.0001, "loss": 7.7592, "loss/crossentropy": 2.204551935195923, "loss/hidden": 3.15625, "loss/jsd": 0.0, "loss/logits": 0.24570683389902115, "step": 3784 }, { "epoch": 0.236625, "grad_norm": 2.90625, "grad_norm_var": 0.011637369791666666, "learning_rate": 0.0001, "loss": 7.8933, "loss/crossentropy": 2.354386806488037, "loss/hidden": 3.171875, "loss/jsd": 0.0, "loss/logits": 0.25208230316638947, "step": 3786 }, { "epoch": 0.23675, "grad_norm": 3.3125, "grad_norm_var": 0.04938151041666667, "learning_rate": 0.0001, "loss": 8.1279, "loss/crossentropy": 2.3427704572677612, "loss/hidden": 3.1640625, "loss/jsd": 0.0, "loss/logits": 0.2587417662143707, "step": 3788 }, { "epoch": 0.236875, "grad_norm": 2.84375, "grad_norm_var": 0.047606404622395834, "learning_rate": 0.0001, "loss": 8.0057, "loss/crossentropy": 2.320341467857361, "loss/hidden": 3.1640625, "loss/jsd": 0.0, "loss/logits": 0.2580983489751816, "step": 3790 }, { "epoch": 0.237, "grad_norm": 2.71875, "grad_norm_var": 0.04879150390625, "learning_rate": 0.0001, "loss": 8.0361, "loss/crossentropy": 2.3142045736312866, "loss/hidden": 3.1171875, "loss/jsd": 0.0, "loss/logits": 0.25220172107219696, "step": 3792 }, { "epoch": 0.237125, "grad_norm": 3.1875, "grad_norm_var": 0.05084228515625, "learning_rate": 0.0001, "loss": 8.0118, "loss/crossentropy": 2.242996096611023, "loss/hidden": 3.1484375, "loss/jsd": 0.0, "loss/logits": 0.25726889073848724, "step": 3794 }, { "epoch": 0.23725, "grad_norm": 2.671875, "grad_norm_var": 0.050511678059895836, "learning_rate": 0.0001, "loss": 7.8234, "loss/crossentropy": 2.1468963623046875, "loss/hidden": 3.1328125, "loss/jsd": 0.0, "loss/logits": 0.2335699424147606, "step": 3796 }, { "epoch": 0.237375, "grad_norm": 2.984375, "grad_norm_var": 0.05858968098958333, "learning_rate": 0.0001, "loss": 7.8601, "loss/crossentropy": 2.1172631978988647, "loss/hidden": 3.1484375, "loss/jsd": 0.0, "loss/logits": 0.22815796732902527, "step": 3798 }, { "epoch": 0.2375, "grad_norm": 2.984375, "grad_norm_var": 0.054931640625, "learning_rate": 0.0001, "loss": 7.92, "loss/crossentropy": 2.322197914123535, "loss/hidden": 3.1015625, "loss/jsd": 0.0, "loss/logits": 0.23355276137590408, "step": 3800 }, { "epoch": 0.237625, "grad_norm": 2.75, "grad_norm_var": 0.05657552083333333, "learning_rate": 0.0001, "loss": 7.7418, "loss/crossentropy": 2.3256465196609497, "loss/hidden": 3.1796875, "loss/jsd": 0.0, "loss/logits": 0.2376643791794777, "step": 3802 }, { "epoch": 0.23775, "grad_norm": 2.75, "grad_norm_var": 0.030394490559895834, "learning_rate": 0.0001, "loss": 8.0796, "loss/crossentropy": 2.3443360328674316, "loss/hidden": 3.1484375, "loss/jsd": 0.0, "loss/logits": 0.2643965929746628, "step": 3804 }, { "epoch": 0.237875, "grad_norm": 3.0625, "grad_norm_var": 0.0342437744140625, "learning_rate": 0.0001, "loss": 7.8792, "loss/crossentropy": 2.1913982629776, "loss/hidden": 3.1171875, "loss/jsd": 0.0, "loss/logits": 0.25000348687171936, "step": 3806 }, { "epoch": 0.238, "grad_norm": 2.90625, "grad_norm_var": 0.03147684733072917, "learning_rate": 0.0001, "loss": 7.8622, "loss/crossentropy": 2.425496220588684, "loss/hidden": 3.1171875, "loss/jsd": 0.0, "loss/logits": 0.24364301562309265, "step": 3808 }, { "epoch": 0.238125, "grad_norm": 2.90625, "grad_norm_var": 0.025690714518229168, "learning_rate": 0.0001, "loss": 8.072, "loss/crossentropy": 2.3379809856414795, "loss/hidden": 3.15625, "loss/jsd": 0.0, "loss/logits": 0.2686043605208397, "step": 3810 }, { "epoch": 0.23825, "grad_norm": 3.234375, "grad_norm_var": 0.030171712239583332, "learning_rate": 0.0001, "loss": 7.7219, "loss/crossentropy": 1.885031819343567, "loss/hidden": 3.1015625, "loss/jsd": 0.0, "loss/logits": 0.22808341681957245, "step": 3812 }, { "epoch": 0.238375, "grad_norm": 2.765625, "grad_norm_var": 0.025651041666666666, "learning_rate": 0.0001, "loss": 8.0085, "loss/crossentropy": 2.3886247873306274, "loss/hidden": 3.125, "loss/jsd": 0.0, "loss/logits": 0.24375489354133606, "step": 3814 }, { "epoch": 0.2385, "grad_norm": 2.9375, "grad_norm_var": 0.025202433268229168, "learning_rate": 0.0001, "loss": 7.8644, "loss/crossentropy": 2.130933403968811, "loss/hidden": 3.0234375, "loss/jsd": 0.0, "loss/logits": 0.23475061357021332, "step": 3816 }, { "epoch": 0.238625, "grad_norm": 2.78125, "grad_norm_var": 0.024885050455729165, "learning_rate": 0.0001, "loss": 7.9254, "loss/crossentropy": 2.242841362953186, "loss/hidden": 3.1640625, "loss/jsd": 0.0, "loss/logits": 0.2557855248451233, "step": 3818 }, { "epoch": 0.23875, "grad_norm": 2.8125, "grad_norm_var": 0.024104817708333334, "learning_rate": 0.0001, "loss": 8.0524, "loss/crossentropy": 2.1857462525367737, "loss/hidden": 3.1875, "loss/jsd": 0.0, "loss/logits": 0.22692881524562836, "step": 3820 }, { "epoch": 0.238875, "grad_norm": 2.9375, "grad_norm_var": 0.020279947916666666, "learning_rate": 0.0001, "loss": 8.0351, "loss/crossentropy": 2.412648320198059, "loss/hidden": 3.140625, "loss/jsd": 0.0, "loss/logits": 0.24989540874958038, "step": 3822 }, { "epoch": 0.239, "grad_norm": 2.984375, "grad_norm_var": 0.10816650390625, "learning_rate": 0.0001, "loss": 7.9189, "loss/crossentropy": 2.2715072631835938, "loss/hidden": 3.1796875, "loss/jsd": 0.0, "loss/logits": 0.24672597646713257, "step": 3824 }, { "epoch": 0.239125, "grad_norm": 3.046875, "grad_norm_var": 0.10906473795572917, "learning_rate": 0.0001, "loss": 7.9206, "loss/crossentropy": 2.26256263256073, "loss/hidden": 3.140625, "loss/jsd": 0.0, "loss/logits": 0.26400326192379, "step": 3826 }, { "epoch": 0.23925, "grad_norm": 2.8125, "grad_norm_var": 0.10342508951822917, "learning_rate": 0.0001, "loss": 7.7574, "loss/crossentropy": 2.229863405227661, "loss/hidden": 3.1484375, "loss/jsd": 0.0, "loss/logits": 0.2663080394268036, "step": 3828 }, { "epoch": 0.239375, "grad_norm": 2.859375, "grad_norm_var": 0.09905192057291666, "learning_rate": 0.0001, "loss": 8.0812, "loss/crossentropy": 2.4192259311676025, "loss/hidden": 3.109375, "loss/jsd": 0.0, "loss/logits": 0.26135944575071335, "step": 3830 }, { "epoch": 0.2395, "grad_norm": 2.75, "grad_norm_var": 0.09844462076822917, "learning_rate": 0.0001, "loss": 7.8173, "loss/crossentropy": 2.3621386289596558, "loss/hidden": 3.0859375, "loss/jsd": 0.0, "loss/logits": 0.23955464363098145, "step": 3832 }, { "epoch": 0.239625, "grad_norm": 2.953125, "grad_norm_var": 0.09664306640625, "learning_rate": 0.0001, "loss": 7.936, "loss/crossentropy": 2.368203043937683, "loss/hidden": 3.1796875, "loss/jsd": 0.0, "loss/logits": 0.26091665029525757, "step": 3834 }, { "epoch": 0.23975, "grad_norm": 2.953125, "grad_norm_var": 0.09709879557291666, "learning_rate": 0.0001, "loss": 7.8929, "loss/crossentropy": 2.2252918481826782, "loss/hidden": 3.140625, "loss/jsd": 0.0, "loss/logits": 0.24965070188045502, "step": 3836 }, { "epoch": 0.239875, "grad_norm": 2.765625, "grad_norm_var": 0.10928446451822917, "learning_rate": 0.0001, "loss": 7.8, "loss/crossentropy": 2.2704564332962036, "loss/hidden": 3.1171875, "loss/jsd": 0.0, "loss/logits": 0.25084085017442703, "step": 3838 }, { "epoch": 0.24, "grad_norm": 2.515625, "grad_norm_var": 0.019462076822916667, "learning_rate": 0.0001, "loss": 8.0258, "loss/crossentropy": 2.4409396648406982, "loss/hidden": 3.2109375, "loss/jsd": 0.0, "loss/logits": 0.27064767479896545, "step": 3840 }, { "epoch": 0.240125, "grad_norm": 2.734375, "grad_norm_var": 0.01988525390625, "learning_rate": 0.0001, "loss": 8.0067, "loss/crossentropy": 2.4022551774978638, "loss/hidden": 3.125, "loss/jsd": 0.0, "loss/logits": 0.26499345898628235, "step": 3842 }, { "epoch": 0.24025, "grad_norm": 2.75, "grad_norm_var": 0.0198150634765625, "learning_rate": 0.0001, "loss": 7.9328, "loss/crossentropy": 2.2620415687561035, "loss/hidden": 3.15625, "loss/jsd": 0.0, "loss/logits": 0.2681596428155899, "step": 3844 }, { "epoch": 0.240375, "grad_norm": 2.890625, "grad_norm_var": 0.019807942708333335, "learning_rate": 0.0001, "loss": 8.0649, "loss/crossentropy": 2.205040454864502, "loss/hidden": 3.1171875, "loss/jsd": 0.0, "loss/logits": 0.2555413693189621, "step": 3846 }, { "epoch": 0.2405, "grad_norm": 3.03125, "grad_norm_var": 0.022801717122395832, "learning_rate": 0.0001, "loss": 7.8505, "loss/crossentropy": 2.5771888494491577, "loss/hidden": 3.15625, "loss/jsd": 0.0, "loss/logits": 0.2554461359977722, "step": 3848 }, { "epoch": 0.240625, "grad_norm": 2.65625, "grad_norm_var": 0.022980753580729166, "learning_rate": 0.0001, "loss": 7.7335, "loss/crossentropy": 2.1476651430130005, "loss/hidden": 3.1484375, "loss/jsd": 0.0, "loss/logits": 0.24120255559682846, "step": 3850 }, { "epoch": 0.24075, "grad_norm": 3.21875, "grad_norm_var": 0.03230692545572917, "learning_rate": 0.0001, "loss": 8.0287, "loss/crossentropy": 2.435906410217285, "loss/hidden": 3.1484375, "loss/jsd": 0.0, "loss/logits": 0.2562536597251892, "step": 3852 }, { "epoch": 0.240875, "grad_norm": 2.90625, "grad_norm_var": 0.028547159830729165, "learning_rate": 0.0001, "loss": 8.0916, "loss/crossentropy": 2.2504727840423584, "loss/hidden": 3.1015625, "loss/jsd": 0.0, "loss/logits": 0.24065575003623962, "step": 3854 }, { "epoch": 0.241, "grad_norm": 2.796875, "grad_norm_var": 0.020475260416666665, "learning_rate": 0.0001, "loss": 7.8006, "loss/crossentropy": 2.1379220485687256, "loss/hidden": 3.09375, "loss/jsd": 0.0, "loss/logits": 0.2533913552761078, "step": 3856 }, { "epoch": 0.241125, "grad_norm": 2.71875, "grad_norm_var": 0.019364420572916666, "learning_rate": 0.0001, "loss": 7.8473, "loss/crossentropy": 2.5523799657821655, "loss/hidden": 3.1640625, "loss/jsd": 0.0, "loss/logits": 0.24728696048259735, "step": 3858 }, { "epoch": 0.24125, "grad_norm": 2.65625, "grad_norm_var": 0.021354166666666667, "learning_rate": 0.0001, "loss": 8.0523, "loss/crossentropy": 2.3402522802352905, "loss/hidden": 3.0546875, "loss/jsd": 0.0, "loss/logits": 0.23582365363836288, "step": 3860 }, { "epoch": 0.241375, "grad_norm": 3.0625, "grad_norm_var": 0.0236968994140625, "learning_rate": 0.0001, "loss": 7.8628, "loss/crossentropy": 2.312021493911743, "loss/hidden": 3.125, "loss/jsd": 0.0, "loss/logits": 0.25287964940071106, "step": 3862 }, { "epoch": 0.2415, "grad_norm": 3.21875, "grad_norm_var": 0.18788655598958334, "learning_rate": 0.0001, "loss": 7.9496, "loss/crossentropy": 2.5552531480789185, "loss/hidden": 3.1953125, "loss/jsd": 0.0, "loss/logits": 0.26104636490345, "step": 3864 }, { "epoch": 0.241625, "grad_norm": 2.78125, "grad_norm_var": 0.18173726399739584, "learning_rate": 0.0001, "loss": 7.9764, "loss/crossentropy": 2.3297171592712402, "loss/hidden": 3.1875, "loss/jsd": 0.0, "loss/logits": 0.250930480659008, "step": 3866 }, { "epoch": 0.24175, "grad_norm": 2.984375, "grad_norm_var": 0.17805989583333334, "learning_rate": 0.0001, "loss": 7.9646, "loss/crossentropy": 2.406034827232361, "loss/hidden": 3.171875, "loss/jsd": 0.0, "loss/logits": 0.27073846757411957, "step": 3868 }, { "epoch": 0.241875, "grad_norm": 2.9375, "grad_norm_var": 0.18010152180989583, "learning_rate": 0.0001, "loss": 8.0221, "loss/crossentropy": 2.354939103126526, "loss/hidden": 3.2421875, "loss/jsd": 0.0, "loss/logits": 0.2731742113828659, "step": 3870 }, { "epoch": 0.242, "grad_norm": 2.828125, "grad_norm_var": 0.18245035807291668, "learning_rate": 0.0001, "loss": 7.8674, "loss/crossentropy": 2.493025302886963, "loss/hidden": 3.1875, "loss/jsd": 0.0, "loss/logits": 0.23660879582166672, "step": 3872 }, { "epoch": 0.242125, "grad_norm": 2.78125, "grad_norm_var": 0.17879130045572916, "learning_rate": 0.0001, "loss": 8.1985, "loss/crossentropy": 2.4070109128952026, "loss/hidden": 3.109375, "loss/jsd": 0.0, "loss/logits": 0.2546348571777344, "step": 3874 }, { "epoch": 0.24225, "grad_norm": 2.9375, "grad_norm_var": 0.17639058430989582, "learning_rate": 0.0001, "loss": 7.8092, "loss/crossentropy": 2.3700718879699707, "loss/hidden": 3.140625, "loss/jsd": 0.0, "loss/logits": 0.2518589720129967, "step": 3876 }, { "epoch": 0.242375, "grad_norm": 3.078125, "grad_norm_var": 0.17805582682291668, "learning_rate": 0.0001, "loss": 7.9985, "loss/crossentropy": 2.338467597961426, "loss/hidden": 3.140625, "loss/jsd": 0.0, "loss/logits": 0.25343357026576996, "step": 3878 }, { "epoch": 0.2425, "grad_norm": 2.921875, "grad_norm_var": 0.011522420247395833, "learning_rate": 0.0001, "loss": 8.071, "loss/crossentropy": 2.1937835216522217, "loss/hidden": 3.1015625, "loss/jsd": 0.0, "loss/logits": 0.25322993099689484, "step": 3880 }, { "epoch": 0.242625, "grad_norm": 3.046875, "grad_norm_var": 0.0135650634765625, "learning_rate": 0.0001, "loss": 7.8869, "loss/crossentropy": 2.3660850524902344, "loss/hidden": 3.140625, "loss/jsd": 0.0, "loss/logits": 0.252712182700634, "step": 3882 }, { "epoch": 0.24275, "grad_norm": 2.71875, "grad_norm_var": 0.014533487955729167, "learning_rate": 0.0001, "loss": 7.7893, "loss/crossentropy": 2.2534600496292114, "loss/hidden": 3.109375, "loss/jsd": 0.0, "loss/logits": 0.24849370121955872, "step": 3884 }, { "epoch": 0.242875, "grad_norm": 2.75, "grad_norm_var": 0.0155426025390625, "learning_rate": 0.0001, "loss": 7.7321, "loss/crossentropy": 1.9124953746795654, "loss/hidden": 3.1328125, "loss/jsd": 0.0, "loss/logits": 0.2445899397134781, "step": 3886 }, { "epoch": 0.243, "grad_norm": 3.03125, "grad_norm_var": 0.0174224853515625, "learning_rate": 0.0001, "loss": 8.0921, "loss/crossentropy": 2.450052261352539, "loss/hidden": 3.171875, "loss/jsd": 0.0, "loss/logits": 0.2614727318286896, "step": 3888 }, { "epoch": 0.243125, "grad_norm": 2.703125, "grad_norm_var": 0.0187652587890625, "learning_rate": 0.0001, "loss": 7.8815, "loss/crossentropy": 2.2794657945632935, "loss/hidden": 3.109375, "loss/jsd": 0.0, "loss/logits": 0.2508957237005234, "step": 3890 }, { "epoch": 0.24325, "grad_norm": 2.96875, "grad_norm_var": 0.016649373372395835, "learning_rate": 0.0001, "loss": 7.8555, "loss/crossentropy": 2.282740831375122, "loss/hidden": 3.125, "loss/jsd": 0.0, "loss/logits": 0.2370600551366806, "step": 3892 }, { "epoch": 0.243375, "grad_norm": 3.09375, "grad_norm_var": 0.017317708333333334, "learning_rate": 0.0001, "loss": 8.0816, "loss/crossentropy": 2.106474459171295, "loss/hidden": 3.15625, "loss/jsd": 0.0, "loss/logits": 0.26941801607608795, "step": 3894 }, { "epoch": 0.2435, "grad_norm": 2.96875, "grad_norm_var": 0.019456990559895835, "learning_rate": 0.0001, "loss": 8.1698, "loss/crossentropy": 2.224056601524353, "loss/hidden": 3.171875, "loss/jsd": 0.0, "loss/logits": 0.2755711078643799, "step": 3896 }, { "epoch": 0.243625, "grad_norm": 2.828125, "grad_norm_var": 0.017414347330729166, "learning_rate": 0.0001, "loss": 7.9395, "loss/crossentropy": 2.1422271728515625, "loss/hidden": 3.125, "loss/jsd": 0.0, "loss/logits": 0.2343953251838684, "step": 3898 }, { "epoch": 0.24375, "grad_norm": 3.0625, "grad_norm_var": 0.0193511962890625, "learning_rate": 0.0001, "loss": 8.1452, "loss/crossentropy": 2.2681803703308105, "loss/hidden": 3.140625, "loss/jsd": 0.0, "loss/logits": 0.2323412001132965, "step": 3900 }, { "epoch": 0.243875, "grad_norm": 2.78125, "grad_norm_var": 0.021793619791666666, "learning_rate": 0.0001, "loss": 7.7695, "loss/crossentropy": 2.108498513698578, "loss/hidden": 3.15625, "loss/jsd": 0.0, "loss/logits": 0.2429059073328972, "step": 3902 }, { "epoch": 0.244, "grad_norm": 2.78125, "grad_norm_var": 0.019710286458333334, "learning_rate": 0.0001, "loss": 7.9059, "loss/crossentropy": 2.3447383642196655, "loss/hidden": 3.1640625, "loss/jsd": 0.0, "loss/logits": 0.2611192911863327, "step": 3904 }, { "epoch": 0.244125, "grad_norm": 3.03125, "grad_norm_var": 0.020807902018229168, "learning_rate": 0.0001, "loss": 7.9542, "loss/crossentropy": 1.840607225894928, "loss/hidden": 3.140625, "loss/jsd": 0.0, "loss/logits": 0.23171918094158173, "step": 3906 }, { "epoch": 0.24425, "grad_norm": 2.765625, "grad_norm_var": 0.020882161458333333, "learning_rate": 0.0001, "loss": 7.7736, "loss/crossentropy": 2.113314151763916, "loss/hidden": 3.078125, "loss/jsd": 0.0, "loss/logits": 0.23394189774990082, "step": 3908 }, { "epoch": 0.244375, "grad_norm": 3.125, "grad_norm_var": 0.021549479166666666, "learning_rate": 0.0001, "loss": 7.9819, "loss/crossentropy": 2.252769947052002, "loss/hidden": 3.15625, "loss/jsd": 0.0, "loss/logits": 0.25148941576480865, "step": 3910 }, { "epoch": 0.2445, "grad_norm": 2.953125, "grad_norm_var": 0.027567545572916668, "learning_rate": 0.0001, "loss": 8.0101, "loss/crossentropy": 2.1127032041549683, "loss/hidden": 3.1640625, "loss/jsd": 0.0, "loss/logits": 0.2942749857902527, "step": 3912 }, { "epoch": 0.244625, "grad_norm": 2.953125, "grad_norm_var": 0.02822265625, "learning_rate": 0.0001, "loss": 8.1576, "loss/crossentropy": 2.096681237220764, "loss/hidden": 3.0625, "loss/jsd": 0.0, "loss/logits": 0.2617676854133606, "step": 3914 }, { "epoch": 0.24475, "grad_norm": 2.9375, "grad_norm_var": 0.024837239583333334, "learning_rate": 0.0001, "loss": 7.9193, "loss/crossentropy": 2.312508463859558, "loss/hidden": 3.15625, "loss/jsd": 0.0, "loss/logits": 0.27543358504772186, "step": 3916 }, { "epoch": 0.244875, "grad_norm": 2.578125, "grad_norm_var": 0.0254547119140625, "learning_rate": 0.0001, "loss": 8.0867, "loss/crossentropy": 2.5416085720062256, "loss/hidden": 3.1328125, "loss/jsd": 0.0, "loss/logits": 0.2532195746898651, "step": 3918 }, { "epoch": 0.245, "grad_norm": 2.8125, "grad_norm_var": 0.028319295247395834, "learning_rate": 0.0001, "loss": 7.8437, "loss/crossentropy": 1.9256672859191895, "loss/hidden": 3.0859375, "loss/jsd": 0.0, "loss/logits": 0.23992876708507538, "step": 3920 }, { "epoch": 0.245125, "grad_norm": 2.71875, "grad_norm_var": 0.027083333333333334, "learning_rate": 0.0001, "loss": 7.7237, "loss/crossentropy": 2.0629305839538574, "loss/hidden": 3.140625, "loss/jsd": 0.0, "loss/logits": 0.2529604434967041, "step": 3922 }, { "epoch": 0.24525, "grad_norm": 2.9375, "grad_norm_var": 0.0267974853515625, "learning_rate": 0.0001, "loss": 8.2066, "loss/crossentropy": 2.533225417137146, "loss/hidden": 3.140625, "loss/jsd": 0.0, "loss/logits": 0.2775707393884659, "step": 3924 }, { "epoch": 0.245375, "grad_norm": 3.125, "grad_norm_var": 0.028246053059895835, "learning_rate": 0.0001, "loss": 7.8036, "loss/crossentropy": 2.3668118715286255, "loss/hidden": 3.09375, "loss/jsd": 0.0, "loss/logits": 0.24234110862016678, "step": 3926 }, { "epoch": 0.2455, "grad_norm": 2.71875, "grad_norm_var": 0.018382771809895834, "learning_rate": 0.0001, "loss": 7.7284, "loss/crossentropy": 2.1942179203033447, "loss/hidden": 3.15625, "loss/jsd": 0.0, "loss/logits": 0.25301945209503174, "step": 3928 }, { "epoch": 0.245625, "grad_norm": 3.3125, "grad_norm_var": 0.0335113525390625, "learning_rate": 0.0001, "loss": 7.9152, "loss/crossentropy": 2.3835543394088745, "loss/hidden": 3.1171875, "loss/jsd": 0.0, "loss/logits": 0.24918851256370544, "step": 3930 }, { "epoch": 0.24575, "grad_norm": 2.828125, "grad_norm_var": 0.040848795572916666, "learning_rate": 0.0001, "loss": 8.0722, "loss/crossentropy": 2.275932550430298, "loss/hidden": 3.1640625, "loss/jsd": 0.0, "loss/logits": 0.27397048473358154, "step": 3932 }, { "epoch": 0.245875, "grad_norm": 2.84375, "grad_norm_var": 0.03684794108072917, "learning_rate": 0.0001, "loss": 7.7922, "loss/crossentropy": 2.214387059211731, "loss/hidden": 3.1796875, "loss/jsd": 0.0, "loss/logits": 0.25321590155363083, "step": 3934 }, { "epoch": 0.246, "grad_norm": 2.78125, "grad_norm_var": 0.034821573893229166, "learning_rate": 0.0001, "loss": 7.7217, "loss/crossentropy": 2.2051841020584106, "loss/hidden": 3.109375, "loss/jsd": 0.0, "loss/logits": 0.2417343333363533, "step": 3936 }, { "epoch": 0.246125, "grad_norm": 2.859375, "grad_norm_var": 0.0321441650390625, "learning_rate": 0.0001, "loss": 7.7859, "loss/crossentropy": 2.1569536924362183, "loss/hidden": 3.1328125, "loss/jsd": 0.0, "loss/logits": 0.24789589643478394, "step": 3938 }, { "epoch": 0.24625, "grad_norm": 2.71875, "grad_norm_var": 0.0367828369140625, "learning_rate": 0.0001, "loss": 7.9834, "loss/crossentropy": 2.171297550201416, "loss/hidden": 3.1484375, "loss/jsd": 0.0, "loss/logits": 0.2569433003664017, "step": 3940 }, { "epoch": 0.246375, "grad_norm": 3.1875, "grad_norm_var": 0.03746337890625, "learning_rate": 0.0001, "loss": 8.0254, "loss/crossentropy": 2.353532552719116, "loss/hidden": 3.140625, "loss/jsd": 0.0, "loss/logits": 0.26509344577789307, "step": 3942 }, { "epoch": 0.2465, "grad_norm": 2.875, "grad_norm_var": 0.03327534993489583, "learning_rate": 0.0001, "loss": 8.0222, "loss/crossentropy": 2.264642834663391, "loss/hidden": 3.109375, "loss/jsd": 0.0, "loss/logits": 0.26274262368679047, "step": 3944 }, { "epoch": 0.246625, "grad_norm": 2.875, "grad_norm_var": 0.024637858072916668, "learning_rate": 0.0001, "loss": 8.0714, "loss/crossentropy": 2.372550845146179, "loss/hidden": 3.15625, "loss/jsd": 0.0, "loss/logits": 0.25714441388845444, "step": 3946 }, { "epoch": 0.24675, "grad_norm": 2.65625, "grad_norm_var": 0.022980753580729166, "learning_rate": 0.0001, "loss": 7.9289, "loss/crossentropy": 2.4531720876693726, "loss/hidden": 3.15625, "loss/jsd": 0.0, "loss/logits": 0.2564563602209091, "step": 3948 }, { "epoch": 0.246875, "grad_norm": 3.0, "grad_norm_var": 0.02154541015625, "learning_rate": 0.0001, "loss": 8.1788, "loss/crossentropy": 2.4645724296569824, "loss/hidden": 3.1328125, "loss/jsd": 0.0, "loss/logits": 0.2531467527151108, "step": 3950 }, { "epoch": 0.247, "grad_norm": 2.84375, "grad_norm_var": 0.020335896809895834, "learning_rate": 0.0001, "loss": 8.087, "loss/crossentropy": 2.2224488258361816, "loss/hidden": 3.140625, "loss/jsd": 0.0, "loss/logits": 0.27072136104106903, "step": 3952 }, { "epoch": 0.247125, "grad_norm": 2.75, "grad_norm_var": 0.021564737955729166, "learning_rate": 0.0001, "loss": 7.7025, "loss/crossentropy": 2.1155717372894287, "loss/hidden": 3.1171875, "loss/jsd": 0.0, "loss/logits": 0.2502880319952965, "step": 3954 }, { "epoch": 0.24725, "grad_norm": 2.859375, "grad_norm_var": 0.015901692708333335, "learning_rate": 0.0001, "loss": 7.9457, "loss/crossentropy": 2.156541883945465, "loss/hidden": 3.171875, "loss/jsd": 0.0, "loss/logits": 0.25808054208755493, "step": 3956 }, { "epoch": 0.247375, "grad_norm": 2.71875, "grad_norm_var": 0.009040323893229167, "learning_rate": 0.0001, "loss": 7.6475, "loss/crossentropy": 2.043439030647278, "loss/hidden": 3.078125, "loss/jsd": 0.0, "loss/logits": 0.22348909080028534, "step": 3958 }, { "epoch": 0.2475, "grad_norm": 2.953125, "grad_norm_var": 0.009227498372395834, "learning_rate": 0.0001, "loss": 7.7652, "loss/crossentropy": 2.283087968826294, "loss/hidden": 3.1953125, "loss/jsd": 0.0, "loss/logits": 0.2664376199245453, "step": 3960 }, { "epoch": 0.247625, "grad_norm": 2.953125, "grad_norm_var": 0.0106109619140625, "learning_rate": 0.0001, "loss": 8.1861, "loss/crossentropy": 2.446289896965027, "loss/hidden": 3.1015625, "loss/jsd": 0.0, "loss/logits": 0.24126730859279633, "step": 3962 }, { "epoch": 0.24775, "grad_norm": 2.984375, "grad_norm_var": 0.021024576822916665, "learning_rate": 0.0001, "loss": 8.0295, "loss/crossentropy": 2.106830358505249, "loss/hidden": 3.15625, "loss/jsd": 0.0, "loss/logits": 0.2426551803946495, "step": 3964 }, { "epoch": 0.247875, "grad_norm": 3.03125, "grad_norm_var": 0.022541300455729166, "learning_rate": 0.0001, "loss": 8.0848, "loss/crossentropy": 2.434194803237915, "loss/hidden": 3.203125, "loss/jsd": 0.0, "loss/logits": 0.24056891351938248, "step": 3966 }, { "epoch": 0.248, "grad_norm": 2.671875, "grad_norm_var": 0.03730061848958333, "learning_rate": 0.0001, "loss": 7.9573, "loss/crossentropy": 2.1523804664611816, "loss/hidden": 3.15625, "loss/jsd": 0.0, "loss/logits": 0.24403362721204758, "step": 3968 }, { "epoch": 0.248125, "grad_norm": 2.765625, "grad_norm_var": 0.03892822265625, "learning_rate": 0.0001, "loss": 7.8268, "loss/crossentropy": 2.2469995617866516, "loss/hidden": 3.125, "loss/jsd": 0.0, "loss/logits": 0.2514503300189972, "step": 3970 }, { "epoch": 0.24825, "grad_norm": 2.703125, "grad_norm_var": 0.0411285400390625, "learning_rate": 0.0001, "loss": 7.9338, "loss/crossentropy": 2.1976422667503357, "loss/hidden": 3.0546875, "loss/jsd": 0.0, "loss/logits": 0.24978452920913696, "step": 3972 }, { "epoch": 0.248375, "grad_norm": 2.84375, "grad_norm_var": 0.03876851399739583, "learning_rate": 0.0001, "loss": 7.6979, "loss/crossentropy": 2.2436105012893677, "loss/hidden": 3.1015625, "loss/jsd": 0.0, "loss/logits": 0.2375164031982422, "step": 3974 }, { "epoch": 0.2485, "grad_norm": 2.796875, "grad_norm_var": 0.037718709309895834, "learning_rate": 0.0001, "loss": 7.8885, "loss/crossentropy": 2.1020787954330444, "loss/hidden": 3.0703125, "loss/jsd": 0.0, "loss/logits": 0.22866419702768326, "step": 3976 }, { "epoch": 0.248625, "grad_norm": 3.296875, "grad_norm_var": 0.047362263997395834, "learning_rate": 0.0001, "loss": 8.104, "loss/crossentropy": 2.442590594291687, "loss/hidden": 3.21875, "loss/jsd": 0.0, "loss/logits": 0.2711693271994591, "step": 3978 }, { "epoch": 0.24875, "grad_norm": 2.890625, "grad_norm_var": 0.03736063639322917, "learning_rate": 0.0001, "loss": 7.8339, "loss/crossentropy": 2.3493313789367676, "loss/hidden": 3.1171875, "loss/jsd": 0.0, "loss/logits": 0.24219609797000885, "step": 3980 }, { "epoch": 0.248875, "grad_norm": 2.625, "grad_norm_var": 0.0386383056640625, "learning_rate": 0.0001, "loss": 7.7534, "loss/crossentropy": 2.0476667881011963, "loss/hidden": 3.1015625, "loss/jsd": 0.0, "loss/logits": 0.239098958671093, "step": 3982 }, { "epoch": 0.249, "grad_norm": 2.703125, "grad_norm_var": 0.025129191080729165, "learning_rate": 0.0001, "loss": 8.087, "loss/crossentropy": 2.547991156578064, "loss/hidden": 3.1796875, "loss/jsd": 0.0, "loss/logits": 0.2603266090154648, "step": 3984 }, { "epoch": 0.249125, "grad_norm": 2.828125, "grad_norm_var": 0.0277984619140625, "learning_rate": 0.0001, "loss": 7.9039, "loss/crossentropy": 2.645310878753662, "loss/hidden": 3.140625, "loss/jsd": 0.0, "loss/logits": 0.2637501657009125, "step": 3986 }, { "epoch": 0.24925, "grad_norm": 2.90625, "grad_norm_var": 0.031078084309895834, "learning_rate": 0.0001, "loss": 7.9646, "loss/crossentropy": 2.2159535884857178, "loss/hidden": 3.15625, "loss/jsd": 0.0, "loss/logits": 0.25144505500793457, "step": 3988 }, { "epoch": 0.249375, "grad_norm": 2.890625, "grad_norm_var": 0.03127339680989583, "learning_rate": 0.0001, "loss": 7.9682, "loss/crossentropy": 2.3531523942947388, "loss/hidden": 3.15625, "loss/jsd": 0.0, "loss/logits": 0.2576625347137451, "step": 3990 }, { "epoch": 0.2495, "grad_norm": 2.71875, "grad_norm_var": 0.032515462239583334, "learning_rate": 0.0001, "loss": 7.9189, "loss/crossentropy": 2.2137093544006348, "loss/hidden": 3.125, "loss/jsd": 0.0, "loss/logits": 0.2575615271925926, "step": 3992 }, { "epoch": 0.249625, "grad_norm": 2.671875, "grad_norm_var": 0.021842447916666667, "learning_rate": 0.0001, "loss": 7.8345, "loss/crossentropy": 2.3585588932037354, "loss/hidden": 3.1875, "loss/jsd": 0.0, "loss/logits": 0.2596537619829178, "step": 3994 }, { "epoch": 0.24975, "grad_norm": 3.015625, "grad_norm_var": 0.027074178059895832, "learning_rate": 0.0001, "loss": 7.9586, "loss/crossentropy": 2.2737538814544678, "loss/hidden": 3.1015625, "loss/jsd": 0.0, "loss/logits": 0.25356176495552063, "step": 3996 }, { "epoch": 0.249875, "grad_norm": 2.921875, "grad_norm_var": 0.0232330322265625, "learning_rate": 0.0001, "loss": 7.9267, "loss/crossentropy": 2.1807756423950195, "loss/hidden": 3.125, "loss/jsd": 0.0, "loss/logits": 0.2504650503396988, "step": 3998 }, { "epoch": 0.25, "grad_norm": 2.9375, "grad_norm_var": 0.01959228515625, "learning_rate": 0.0001, "loss": 7.9314, "loss/crossentropy": 2.3070114850997925, "loss/hidden": 3.125, "loss/jsd": 0.0, "loss/logits": 0.2451673448085785, "step": 4000 }, { "epoch": 0.250125, "grad_norm": 2.984375, "grad_norm_var": 0.023758951822916666, "learning_rate": 0.0001, "loss": 7.9656, "loss/crossentropy": 2.086524724960327, "loss/hidden": 3.1171875, "loss/jsd": 0.0, "loss/logits": 0.22823181003332138, "step": 4002 }, { "epoch": 0.25025, "grad_norm": 2.734375, "grad_norm_var": 0.022623697916666668, "learning_rate": 0.0001, "loss": 7.8587, "loss/crossentropy": 2.266258955001831, "loss/hidden": 3.1640625, "loss/jsd": 0.0, "loss/logits": 0.24971243739128113, "step": 4004 }, { "epoch": 0.250375, "grad_norm": 2.890625, "grad_norm_var": 0.023844401041666668, "learning_rate": 0.0001, "loss": 7.8428, "loss/crossentropy": 2.1357154846191406, "loss/hidden": 3.078125, "loss/jsd": 0.0, "loss/logits": 0.23950235545635223, "step": 4006 }, { "epoch": 0.2505, "grad_norm": 2.8125, "grad_norm_var": 0.02437744140625, "learning_rate": 0.0001, "loss": 7.867, "loss/crossentropy": 2.4124141931533813, "loss/hidden": 3.1484375, "loss/jsd": 0.0, "loss/logits": 0.267966166138649, "step": 4008 }, { "epoch": 0.250625, "grad_norm": 2.796875, "grad_norm_var": 0.0216705322265625, "learning_rate": 0.0001, "loss": 7.9301, "loss/crossentropy": 2.1274200677871704, "loss/hidden": 3.140625, "loss/jsd": 0.0, "loss/logits": 0.23959754407405853, "step": 4010 }, { "epoch": 0.25075, "grad_norm": 2.953125, "grad_norm_var": 0.0292144775390625, "learning_rate": 0.0001, "loss": 8.1943, "loss/crossentropy": 2.423035979270935, "loss/hidden": 3.3046875, "loss/jsd": 0.0, "loss/logits": 0.2570289820432663, "step": 4012 }, { "epoch": 0.250875, "grad_norm": 2.8125, "grad_norm_var": 0.02926025390625, "learning_rate": 0.0001, "loss": 7.8874, "loss/crossentropy": 2.096481442451477, "loss/hidden": 3.140625, "loss/jsd": 0.0, "loss/logits": 0.25379370152950287, "step": 4014 }, { "epoch": 0.251, "grad_norm": 2.796875, "grad_norm_var": 0.029541015625, "learning_rate": 0.0001, "loss": 8.0024, "loss/crossentropy": 2.1518774032592773, "loss/hidden": 3.0859375, "loss/jsd": 0.0, "loss/logits": 0.24547205865383148, "step": 4016 }, { "epoch": 0.251125, "grad_norm": 3.0, "grad_norm_var": 0.024658203125, "learning_rate": 0.0001, "loss": 7.9304, "loss/crossentropy": 2.1806546449661255, "loss/hidden": 3.15625, "loss/jsd": 0.0, "loss/logits": 0.25979387760162354, "step": 4018 }, { "epoch": 0.25125, "grad_norm": 2.953125, "grad_norm_var": 0.020531209309895833, "learning_rate": 0.0001, "loss": 7.9947, "loss/crossentropy": 2.533255457878113, "loss/hidden": 3.1328125, "loss/jsd": 0.0, "loss/logits": 0.24436841905117035, "step": 4020 }, { "epoch": 0.251375, "grad_norm": 2.84375, "grad_norm_var": 0.018871053059895834, "learning_rate": 0.0001, "loss": 8.0565, "loss/crossentropy": 2.113765239715576, "loss/hidden": 3.1015625, "loss/jsd": 0.0, "loss/logits": 0.23419301211833954, "step": 4022 }, { "epoch": 0.2515, "grad_norm": 2.828125, "grad_norm_var": 0.022037760416666666, "learning_rate": 0.0001, "loss": 7.6505, "loss/crossentropy": 2.2082241773605347, "loss/hidden": 3.1640625, "loss/jsd": 0.0, "loss/logits": 0.251997672021389, "step": 4024 }, { "epoch": 0.251625, "grad_norm": 2.828125, "grad_norm_var": 0.0241607666015625, "learning_rate": 0.0001, "loss": 7.9258, "loss/crossentropy": 2.114282548427582, "loss/hidden": 3.1875, "loss/jsd": 0.0, "loss/logits": 0.23022376000881195, "step": 4026 }, { "epoch": 0.25175, "grad_norm": 2.890625, "grad_norm_var": 0.013602701822916667, "learning_rate": 0.0001, "loss": 7.7699, "loss/crossentropy": 2.2350351810455322, "loss/hidden": 3.1015625, "loss/jsd": 0.0, "loss/logits": 0.23465155065059662, "step": 4028 }, { "epoch": 0.251875, "grad_norm": 2.984375, "grad_norm_var": 0.033426920572916664, "learning_rate": 0.0001, "loss": 8.0496, "loss/crossentropy": 2.4656600952148438, "loss/hidden": 3.171875, "loss/jsd": 0.0, "loss/logits": 0.2750798910856247, "step": 4030 }, { "epoch": 0.252, "grad_norm": 2.859375, "grad_norm_var": 0.03725484212239583, "learning_rate": 0.0001, "loss": 7.77, "loss/crossentropy": 2.2551556825637817, "loss/hidden": 3.03125, "loss/jsd": 0.0, "loss/logits": 0.2375420331954956, "step": 4032 }, { "epoch": 0.252125, "grad_norm": 2.75, "grad_norm_var": 0.03486226399739583, "learning_rate": 0.0001, "loss": 7.8386, "loss/crossentropy": 2.3349199295043945, "loss/hidden": 3.171875, "loss/jsd": 0.0, "loss/logits": 0.25754740834236145, "step": 4034 }, { "epoch": 0.25225, "grad_norm": 2.6875, "grad_norm_var": 0.035888671875, "learning_rate": 0.0001, "loss": 7.7576, "loss/crossentropy": 2.240418553352356, "loss/hidden": 3.1015625, "loss/jsd": 0.0, "loss/logits": 0.23132948577404022, "step": 4036 }, { "epoch": 0.252375, "grad_norm": 2.96875, "grad_norm_var": 0.037398274739583334, "learning_rate": 0.0001, "loss": 8.0782, "loss/crossentropy": 2.6016796827316284, "loss/hidden": 3.125, "loss/jsd": 0.0, "loss/logits": 0.24654637277126312, "step": 4038 }, { "epoch": 0.2525, "grad_norm": 2.640625, "grad_norm_var": 0.03619384765625, "learning_rate": 0.0001, "loss": 7.9179, "loss/crossentropy": 2.189319610595703, "loss/hidden": 3.046875, "loss/jsd": 0.0, "loss/logits": 0.2414143830537796, "step": 4040 }, { "epoch": 0.252625, "grad_norm": 2.875, "grad_norm_var": 0.03726806640625, "learning_rate": 0.0001, "loss": 7.8792, "loss/crossentropy": 2.339111566543579, "loss/hidden": 3.1171875, "loss/jsd": 0.0, "loss/logits": 0.2566460520029068, "step": 4042 }, { "epoch": 0.25275, "grad_norm": 2.921875, "grad_norm_var": 0.03737691243489583, "learning_rate": 0.0001, "loss": 8.0226, "loss/crossentropy": 2.5018938779830933, "loss/hidden": 3.140625, "loss/jsd": 0.0, "loss/logits": 0.2496292144060135, "step": 4044 }, { "epoch": 0.252875, "grad_norm": 2.96875, "grad_norm_var": 0.01529541015625, "learning_rate": 0.0001, "loss": 7.7494, "loss/crossentropy": 2.3140684366226196, "loss/hidden": 3.0859375, "loss/jsd": 0.0, "loss/logits": 0.24685319513082504, "step": 4046 }, { "epoch": 0.253, "grad_norm": 2.890625, "grad_norm_var": 0.014037068684895833, "learning_rate": 0.0001, "loss": 8.0641, "loss/crossentropy": 2.472969889640808, "loss/hidden": 3.21875, "loss/jsd": 0.0, "loss/logits": 0.29279862344264984, "step": 4048 }, { "epoch": 0.253125, "grad_norm": 2.9375, "grad_norm_var": 0.013914998372395833, "learning_rate": 0.0001, "loss": 8.2353, "loss/crossentropy": 2.499064326286316, "loss/hidden": 3.1328125, "loss/jsd": 0.0, "loss/logits": 0.24005597829818726, "step": 4050 }, { "epoch": 0.25325, "grad_norm": 2.78125, "grad_norm_var": 0.012858072916666666, "learning_rate": 0.0001, "loss": 7.8839, "loss/crossentropy": 2.287570834159851, "loss/hidden": 3.1328125, "loss/jsd": 0.0, "loss/logits": 0.24140357226133347, "step": 4052 }, { "epoch": 0.253375, "grad_norm": 2.859375, "grad_norm_var": 0.016585286458333334, "learning_rate": 0.0001, "loss": 7.8599, "loss/crossentropy": 2.255461096763611, "loss/hidden": 3.1484375, "loss/jsd": 0.0, "loss/logits": 0.27384766936302185, "step": 4054 }, { "epoch": 0.2535, "grad_norm": 2.59375, "grad_norm_var": 0.018538411458333334, "learning_rate": 0.0001, "loss": 7.8689, "loss/crossentropy": 2.260550379753113, "loss/hidden": 3.1328125, "loss/jsd": 0.0, "loss/logits": 0.26458389312028885, "step": 4056 }, { "epoch": 0.253625, "grad_norm": 2.859375, "grad_norm_var": 0.016829427083333334, "learning_rate": 0.0001, "loss": 8.0604, "loss/crossentropy": 2.5747987031936646, "loss/hidden": 3.203125, "loss/jsd": 0.0, "loss/logits": 0.24792767316102982, "step": 4058 }, { "epoch": 0.25375, "grad_norm": 2.984375, "grad_norm_var": 0.02066650390625, "learning_rate": 0.0001, "loss": 7.8461, "loss/crossentropy": 2.1469414830207825, "loss/hidden": 3.1484375, "loss/jsd": 0.0, "loss/logits": 0.24173255264759064, "step": 4060 }, { "epoch": 0.253875, "grad_norm": 2.65625, "grad_norm_var": 0.022175089518229166, "learning_rate": 0.0001, "loss": 7.6776, "loss/crossentropy": 2.221487045288086, "loss/hidden": 3.0625, "loss/jsd": 0.0, "loss/logits": 0.2288023829460144, "step": 4062 }, { "epoch": 0.254, "grad_norm": 2.734375, "grad_norm_var": 0.021903483072916667, "learning_rate": 0.0001, "loss": 7.9139, "loss/crossentropy": 2.248312473297119, "loss/hidden": 3.15625, "loss/jsd": 0.0, "loss/logits": 0.2824964225292206, "step": 4064 }, { "epoch": 0.254125, "grad_norm": 2.640625, "grad_norm_var": 0.027489217122395833, "learning_rate": 0.0001, "loss": 7.8409, "loss/crossentropy": 2.3971487283706665, "loss/hidden": 3.1171875, "loss/jsd": 0.0, "loss/logits": 0.2429996207356453, "step": 4066 }, { "epoch": 0.25425, "grad_norm": 2.90625, "grad_norm_var": 0.028123982747395835, "learning_rate": 0.0001, "loss": 8.2937, "loss/crossentropy": 2.2593578100204468, "loss/hidden": 3.0859375, "loss/jsd": 0.0, "loss/logits": 0.25377459824085236, "step": 4068 }, { "epoch": 0.254375, "grad_norm": 2.96875, "grad_norm_var": 0.025227864583333332, "learning_rate": 0.0001, "loss": 8.0835, "loss/crossentropy": 2.4905598163604736, "loss/hidden": 3.140625, "loss/jsd": 0.0, "loss/logits": 0.27597957849502563, "step": 4070 }, { "epoch": 0.2545, "grad_norm": 2.765625, "grad_norm_var": 0.020719401041666665, "learning_rate": 0.0001, "loss": 7.8124, "loss/crossentropy": 2.069876194000244, "loss/hidden": 3.109375, "loss/jsd": 0.0, "loss/logits": 0.23079124093055725, "step": 4072 }, { "epoch": 0.254625, "grad_norm": 3.09375, "grad_norm_var": 0.027262369791666668, "learning_rate": 0.0001, "loss": 7.8281, "loss/crossentropy": 2.1508991718292236, "loss/hidden": 3.0703125, "loss/jsd": 0.0, "loss/logits": 0.22409434616565704, "step": 4074 }, { "epoch": 0.25475, "grad_norm": 2.703125, "grad_norm_var": 0.023981730143229168, "learning_rate": 0.0001, "loss": 7.8602, "loss/crossentropy": 2.140303373336792, "loss/hidden": 3.09375, "loss/jsd": 0.0, "loss/logits": 0.24273252487182617, "step": 4076 }, { "epoch": 0.254875, "grad_norm": 2.9375, "grad_norm_var": 0.023542277018229165, "learning_rate": 0.0001, "loss": 8.0033, "loss/crossentropy": 2.312852382659912, "loss/hidden": 3.09375, "loss/jsd": 0.0, "loss/logits": 0.24756643921136856, "step": 4078 }, { "epoch": 0.255, "grad_norm": 2.703125, "grad_norm_var": 0.025641886393229167, "learning_rate": 0.0001, "loss": 7.7321, "loss/crossentropy": 2.2065231800079346, "loss/hidden": 3.140625, "loss/jsd": 0.0, "loss/logits": 0.2472347617149353, "step": 4080 }, { "epoch": 0.255125, "grad_norm": 2.71875, "grad_norm_var": 0.0214996337890625, "learning_rate": 0.0001, "loss": 7.9327, "loss/crossentropy": 2.3910540342330933, "loss/hidden": 3.1171875, "loss/jsd": 0.0, "loss/logits": 0.25117581337690353, "step": 4082 }, { "epoch": 0.25525, "grad_norm": 3.015625, "grad_norm_var": 0.023631795247395834, "learning_rate": 0.0001, "loss": 8.0083, "loss/crossentropy": 2.331396460533142, "loss/hidden": 3.1328125, "loss/jsd": 0.0, "loss/logits": 0.2417784184217453, "step": 4084 }, { "epoch": 0.255375, "grad_norm": 2.625, "grad_norm_var": 0.022456868489583334, "learning_rate": 0.0001, "loss": 7.6835, "loss/crossentropy": 2.0976497530937195, "loss/hidden": 3.1640625, "loss/jsd": 0.0, "loss/logits": 0.2585399001836777, "step": 4086 }, { "epoch": 0.2555, "grad_norm": 2.9375, "grad_norm_var": 0.023363240559895835, "learning_rate": 0.0001, "loss": 8.0152, "loss/crossentropy": 2.535896420478821, "loss/hidden": 3.15625, "loss/jsd": 0.0, "loss/logits": 0.25609172880649567, "step": 4088 }, { "epoch": 0.255625, "grad_norm": 2.890625, "grad_norm_var": 0.01826171875, "learning_rate": 0.0001, "loss": 8.1267, "loss/crossentropy": 2.3418773412704468, "loss/hidden": 3.21875, "loss/jsd": 0.0, "loss/logits": 0.2815869003534317, "step": 4090 }, { "epoch": 0.25575, "grad_norm": 2.90625, "grad_norm_var": 0.019189453125, "learning_rate": 0.0001, "loss": 8.1099, "loss/crossentropy": 2.432576060295105, "loss/hidden": 3.1171875, "loss/jsd": 0.0, "loss/logits": 0.254299134016037, "step": 4092 }, { "epoch": 0.255875, "grad_norm": 2.90625, "grad_norm_var": 0.020099894205729166, "learning_rate": 0.0001, "loss": 7.811, "loss/crossentropy": 2.3323466777801514, "loss/hidden": 3.09375, "loss/jsd": 0.0, "loss/logits": 0.2356455698609352, "step": 4094 }, { "epoch": 0.256, "grad_norm": 3.5, "grad_norm_var": 0.33955790201822916, "learning_rate": 0.0001, "loss": 7.886, "loss/crossentropy": 2.1701765060424805, "loss/hidden": 3.140625, "loss/jsd": 0.0, "loss/logits": 0.22543915361166, "step": 4096 }, { "epoch": 0.256125, "grad_norm": 2.9375, "grad_norm_var": 0.33069254557291666, "learning_rate": 0.0001, "loss": 7.894, "loss/crossentropy": 2.4592390060424805, "loss/hidden": 3.140625, "loss/jsd": 0.0, "loss/logits": 0.26361630856990814, "step": 4098 }, { "epoch": 0.25625, "grad_norm": 2.90625, "grad_norm_var": 0.34390869140625, "learning_rate": 0.0001, "loss": 7.8103, "loss/crossentropy": 2.5309205055236816, "loss/hidden": 3.15625, "loss/jsd": 0.0, "loss/logits": 0.2621408849954605, "step": 4100 }, { "epoch": 0.256375, "grad_norm": 3.015625, "grad_norm_var": 0.3298136393229167, "learning_rate": 0.0001, "loss": 7.7602, "loss/crossentropy": 2.206428825855255, "loss/hidden": 3.1796875, "loss/jsd": 0.0, "loss/logits": 0.22442228347063065, "step": 4102 }, { "epoch": 0.2565, "grad_norm": 2.953125, "grad_norm_var": 0.3256907145182292, "learning_rate": 0.0001, "loss": 8.0156, "loss/crossentropy": 2.3760149478912354, "loss/hidden": 3.1484375, "loss/jsd": 0.0, "loss/logits": 0.2524143010377884, "step": 4104 }, { "epoch": 0.256625, "grad_norm": 3.078125, "grad_norm_var": 0.32652079264322914, "learning_rate": 0.0001, "loss": 8.1853, "loss/crossentropy": 2.282678723335266, "loss/hidden": 3.140625, "loss/jsd": 0.0, "loss/logits": 0.26595038920640945, "step": 4106 }, { "epoch": 0.25675, "grad_norm": 3.1875, "grad_norm_var": 0.3243326822916667, "learning_rate": 0.0001, "loss": 7.8999, "loss/crossentropy": 2.182901620864868, "loss/hidden": 3.109375, "loss/jsd": 0.0, "loss/logits": 0.2541113644838333, "step": 4108 }, { "epoch": 0.256875, "grad_norm": 2.546875, "grad_norm_var": 0.34641927083333335, "learning_rate": 0.0001, "loss": 7.7066, "loss/crossentropy": 2.158617377281189, "loss/hidden": 3.0546875, "loss/jsd": 0.0, "loss/logits": 0.21645865589380264, "step": 4110 }, { "epoch": 0.257, "grad_norm": 2.90625, "grad_norm_var": 0.049738566080729164, "learning_rate": 0.0001, "loss": 7.9337, "loss/crossentropy": 2.4083575010299683, "loss/hidden": 3.140625, "loss/jsd": 0.0, "loss/logits": 0.24169963598251343, "step": 4112 }, { "epoch": 0.257125, "grad_norm": 2.609375, "grad_norm_var": 0.061669921875, "learning_rate": 0.0001, "loss": 7.9745, "loss/crossentropy": 2.2410330772399902, "loss/hidden": 3.125, "loss/jsd": 0.0, "loss/logits": 0.24847444891929626, "step": 4114 }, { "epoch": 0.25725, "grad_norm": 2.875, "grad_norm_var": 0.0601470947265625, "learning_rate": 0.0001, "loss": 8.112, "loss/crossentropy": 2.3139537572860718, "loss/hidden": 3.15625, "loss/jsd": 0.0, "loss/logits": 0.26485373079776764, "step": 4116 }, { "epoch": 0.257375, "grad_norm": 2.828125, "grad_norm_var": 0.0606109619140625, "learning_rate": 0.0001, "loss": 7.9185, "loss/crossentropy": 2.2979514598846436, "loss/hidden": 3.109375, "loss/jsd": 0.0, "loss/logits": 0.26377592980861664, "step": 4118 }, { "epoch": 0.2575, "grad_norm": 2.828125, "grad_norm_var": 0.060319010416666666, "learning_rate": 0.0001, "loss": 8.071, "loss/crossentropy": 2.425565242767334, "loss/hidden": 3.109375, "loss/jsd": 0.0, "loss/logits": 0.2566176801919937, "step": 4120 }, { "epoch": 0.257625, "grad_norm": 2.65625, "grad_norm_var": 0.06758524576822916, "learning_rate": 0.0001, "loss": 7.5937, "loss/crossentropy": 2.242913246154785, "loss/hidden": 3.125, "loss/jsd": 0.0, "loss/logits": 0.22992441803216934, "step": 4122 }, { "epoch": 0.25775, "grad_norm": 2.734375, "grad_norm_var": 0.058980305989583336, "learning_rate": 0.0001, "loss": 7.873, "loss/crossentropy": 2.265519142150879, "loss/hidden": 3.15625, "loss/jsd": 0.0, "loss/logits": 0.25814615190029144, "step": 4124 }, { "epoch": 0.257875, "grad_norm": 2.921875, "grad_norm_var": 0.05207417805989583, "learning_rate": 0.0001, "loss": 7.9037, "loss/crossentropy": 2.2956472635269165, "loss/hidden": 3.1953125, "loss/jsd": 0.0, "loss/logits": 0.26307499408721924, "step": 4126 }, { "epoch": 0.258, "grad_norm": 2.859375, "grad_norm_var": 0.0243804931640625, "learning_rate": 0.0001, "loss": 7.997, "loss/crossentropy": 2.37142550945282, "loss/hidden": 3.15625, "loss/jsd": 0.0, "loss/logits": 0.30740588903427124, "step": 4128 }, { "epoch": 0.258125, "grad_norm": 2.84375, "grad_norm_var": 0.013109334309895833, "learning_rate": 0.0001, "loss": 7.922, "loss/crossentropy": 2.4662058353424072, "loss/hidden": 3.171875, "loss/jsd": 0.0, "loss/logits": 0.2652098014950752, "step": 4130 }, { "epoch": 0.25825, "grad_norm": 2.75, "grad_norm_var": 0.0121734619140625, "learning_rate": 0.0001, "loss": 8.0041, "loss/crossentropy": 2.4391783475875854, "loss/hidden": 3.125, "loss/jsd": 0.0, "loss/logits": 0.25314761698246, "step": 4132 }, { "epoch": 0.258375, "grad_norm": 2.5, "grad_norm_var": 0.017096964518229167, "learning_rate": 0.0001, "loss": 7.9945, "loss/crossentropy": 2.1940054893493652, "loss/hidden": 3.140625, "loss/jsd": 0.0, "loss/logits": 0.24650320410728455, "step": 4134 }, { "epoch": 0.2585, "grad_norm": 2.65625, "grad_norm_var": 0.016144816080729166, "learning_rate": 0.0001, "loss": 7.8918, "loss/crossentropy": 2.313614845275879, "loss/hidden": 3.1875, "loss/jsd": 0.0, "loss/logits": 0.2415396273136139, "step": 4136 }, { "epoch": 0.258625, "grad_norm": 3.015625, "grad_norm_var": 0.016852823893229167, "learning_rate": 0.0001, "loss": 7.8513, "loss/crossentropy": 2.2070964574813843, "loss/hidden": 3.0390625, "loss/jsd": 0.0, "loss/logits": 0.23357922583818436, "step": 4138 }, { "epoch": 0.25875, "grad_norm": 2.671875, "grad_norm_var": 0.014867146809895834, "learning_rate": 0.0001, "loss": 7.8301, "loss/crossentropy": 2.1131896376609802, "loss/hidden": 3.1171875, "loss/jsd": 0.0, "loss/logits": 0.25203292071819305, "step": 4140 }, { "epoch": 0.258875, "grad_norm": 2.890625, "grad_norm_var": 0.01490478515625, "learning_rate": 0.0001, "loss": 7.8576, "loss/crossentropy": 2.1146705746650696, "loss/hidden": 3.078125, "loss/jsd": 0.0, "loss/logits": 0.23909609764814377, "step": 4142 }, { "epoch": 0.259, "grad_norm": 3.0625, "grad_norm_var": 0.024592081705729168, "learning_rate": 0.0001, "loss": 8.1841, "loss/crossentropy": 2.4022799730300903, "loss/hidden": 3.125, "loss/jsd": 0.0, "loss/logits": 0.2597605586051941, "step": 4144 }, { "epoch": 0.259125, "grad_norm": 2.65625, "grad_norm_var": 0.08146870930989583, "learning_rate": 0.0001, "loss": 7.9961, "loss/crossentropy": 2.4074816703796387, "loss/hidden": 3.1640625, "loss/jsd": 0.0, "loss/logits": 0.2639220803976059, "step": 4146 }, { "epoch": 0.25925, "grad_norm": 2.9375, "grad_norm_var": 0.08469950358072917, "learning_rate": 0.0001, "loss": 7.9056, "loss/crossentropy": 2.1917613744735718, "loss/hidden": 3.109375, "loss/jsd": 0.0, "loss/logits": 0.24751102924346924, "step": 4148 }, { "epoch": 0.259375, "grad_norm": 2.75, "grad_norm_var": 0.07756245930989583, "learning_rate": 0.0001, "loss": 7.7414, "loss/crossentropy": 2.2915488481521606, "loss/hidden": 3.1640625, "loss/jsd": 0.0, "loss/logits": 0.2460872009396553, "step": 4150 }, { "epoch": 0.2595, "grad_norm": 2.703125, "grad_norm_var": 0.07736002604166667, "learning_rate": 0.0001, "loss": 7.9329, "loss/crossentropy": 2.320430040359497, "loss/hidden": 3.140625, "loss/jsd": 0.0, "loss/logits": 0.24977800250053406, "step": 4152 }, { "epoch": 0.259625, "grad_norm": 3.09375, "grad_norm_var": 0.08010660807291667, "learning_rate": 0.0001, "loss": 8.0587, "loss/crossentropy": 2.563572406768799, "loss/hidden": 3.1796875, "loss/jsd": 0.0, "loss/logits": 0.25170084834098816, "step": 4154 }, { "epoch": 0.25975, "grad_norm": 2.59375, "grad_norm_var": 0.0822174072265625, "learning_rate": 0.0001, "loss": 7.8451, "loss/crossentropy": 2.32357120513916, "loss/hidden": 3.1796875, "loss/jsd": 0.0, "loss/logits": 0.26385223865509033, "step": 4156 }, { "epoch": 0.259875, "grad_norm": 2.734375, "grad_norm_var": 0.08178609212239583, "learning_rate": 0.0001, "loss": 7.7284, "loss/crossentropy": 2.0245505571365356, "loss/hidden": 3.1484375, "loss/jsd": 0.0, "loss/logits": 0.2393287643790245, "step": 4158 }, { "epoch": 0.26, "grad_norm": 2.828125, "grad_norm_var": 0.07543843587239583, "learning_rate": 0.0001, "loss": 7.8159, "loss/crossentropy": 2.474648594856262, "loss/hidden": 3.09375, "loss/jsd": 0.0, "loss/logits": 0.2618861794471741, "step": 4160 }, { "epoch": 0.260125, "grad_norm": 2.640625, "grad_norm_var": 0.0154449462890625, "learning_rate": 0.0001, "loss": 7.8492, "loss/crossentropy": 2.1730494499206543, "loss/hidden": 3.125, "loss/jsd": 0.0, "loss/logits": 0.22336150705814362, "step": 4162 }, { "epoch": 0.26025, "grad_norm": 2.8125, "grad_norm_var": 0.0126861572265625, "learning_rate": 0.0001, "loss": 7.8619, "loss/crossentropy": 2.2360514402389526, "loss/hidden": 3.0390625, "loss/jsd": 0.0, "loss/logits": 0.2400406002998352, "step": 4164 }, { "epoch": 0.260375, "grad_norm": 2.84375, "grad_norm_var": 0.016923014322916666, "learning_rate": 0.0001, "loss": 7.9982, "loss/crossentropy": 2.2315536737442017, "loss/hidden": 3.1171875, "loss/jsd": 0.0, "loss/logits": 0.23362885415554047, "step": 4166 }, { "epoch": 0.2605, "grad_norm": 2.703125, "grad_norm_var": 0.017072550455729165, "learning_rate": 0.0001, "loss": 7.8935, "loss/crossentropy": 2.417250394821167, "loss/hidden": 3.1484375, "loss/jsd": 0.0, "loss/logits": 0.2684229165315628, "step": 4168 }, { "epoch": 0.260625, "grad_norm": 2.78125, "grad_norm_var": 0.0119049072265625, "learning_rate": 0.0001, "loss": 7.9018, "loss/crossentropy": 2.388132095336914, "loss/hidden": 3.15625, "loss/jsd": 0.0, "loss/logits": 0.2596331611275673, "step": 4170 }, { "epoch": 0.26075, "grad_norm": 2.8125, "grad_norm_var": 0.0105377197265625, "learning_rate": 0.0001, "loss": 7.769, "loss/crossentropy": 2.159773111343384, "loss/hidden": 3.125, "loss/jsd": 0.0, "loss/logits": 0.2218840792775154, "step": 4172 }, { "epoch": 0.260875, "grad_norm": 2.65625, "grad_norm_var": 0.011246744791666667, "learning_rate": 0.0001, "loss": 7.8979, "loss/crossentropy": 2.243951916694641, "loss/hidden": 3.1328125, "loss/jsd": 0.0, "loss/logits": 0.23679761588573456, "step": 4174 }, { "epoch": 0.261, "grad_norm": 2.890625, "grad_norm_var": 0.0127838134765625, "learning_rate": 0.0001, "loss": 7.9364, "loss/crossentropy": 2.2255383729934692, "loss/hidden": 3.1171875, "loss/jsd": 0.0, "loss/logits": 0.2511555105447769, "step": 4176 }, { "epoch": 0.261125, "grad_norm": 2.734375, "grad_norm_var": 0.01129150390625, "learning_rate": 0.0001, "loss": 7.7893, "loss/crossentropy": 1.9819644689559937, "loss/hidden": 3.1953125, "loss/jsd": 0.0, "loss/logits": 0.2503453269600868, "step": 4178 }, { "epoch": 0.26125, "grad_norm": 2.875, "grad_norm_var": 0.0135406494140625, "learning_rate": 0.0001, "loss": 7.9879, "loss/crossentropy": 2.328287363052368, "loss/hidden": 3.109375, "loss/jsd": 0.0, "loss/logits": 0.23398541659116745, "step": 4180 }, { "epoch": 0.261375, "grad_norm": 2.734375, "grad_norm_var": 0.015339152018229166, "learning_rate": 0.0001, "loss": 7.8315, "loss/crossentropy": 2.2827576398849487, "loss/hidden": 3.140625, "loss/jsd": 0.0, "loss/logits": 0.2525269314646721, "step": 4182 }, { "epoch": 0.2615, "grad_norm": 2.890625, "grad_norm_var": 0.013939412434895833, "learning_rate": 0.0001, "loss": 7.9949, "loss/crossentropy": 2.303828716278076, "loss/hidden": 3.1796875, "loss/jsd": 0.0, "loss/logits": 0.24650151282548904, "step": 4184 }, { "epoch": 0.261625, "grad_norm": 2.6875, "grad_norm_var": 0.0152740478515625, "learning_rate": 0.0001, "loss": 7.5549, "loss/crossentropy": 2.067635416984558, "loss/hidden": 3.0234375, "loss/jsd": 0.0, "loss/logits": 0.2140519767999649, "step": 4186 }, { "epoch": 0.26175, "grad_norm": 2.828125, "grad_norm_var": 0.0132476806640625, "learning_rate": 0.0001, "loss": 8.019, "loss/crossentropy": 2.4719579219818115, "loss/hidden": 3.1640625, "loss/jsd": 0.0, "loss/logits": 0.2614765018224716, "step": 4188 }, { "epoch": 0.261875, "grad_norm": 3.109375, "grad_norm_var": 0.016044108072916667, "learning_rate": 0.0001, "loss": 7.954, "loss/crossentropy": 2.421759843826294, "loss/hidden": 3.203125, "loss/jsd": 0.0, "loss/logits": 0.31061503291130066, "step": 4190 }, { "epoch": 0.262, "grad_norm": 2.953125, "grad_norm_var": 0.0164703369140625, "learning_rate": 0.0001, "loss": 7.9522, "loss/crossentropy": 2.3707666397094727, "loss/hidden": 3.1171875, "loss/jsd": 0.0, "loss/logits": 0.2634492740035057, "step": 4192 }, { "epoch": 0.262125, "grad_norm": 2.578125, "grad_norm_var": 0.022102864583333333, "learning_rate": 0.0001, "loss": 7.791, "loss/crossentropy": 2.5889461040496826, "loss/hidden": 3.1484375, "loss/jsd": 0.0, "loss/logits": 0.26203466951847076, "step": 4194 }, { "epoch": 0.26225, "grad_norm": 2.921875, "grad_norm_var": 0.02027587890625, "learning_rate": 0.0001, "loss": 7.9226, "loss/crossentropy": 2.1494845151901245, "loss/hidden": 3.109375, "loss/jsd": 0.0, "loss/logits": 0.23617954552173615, "step": 4196 }, { "epoch": 0.262375, "grad_norm": 4.25, "grad_norm_var": 37.5479237874349, "learning_rate": 0.0001, "loss": 8.1442, "loss/crossentropy": 2.266987919807434, "loss/hidden": 3.0859375, "loss/jsd": 0.0, "loss/logits": 0.25409771502017975, "step": 4198 }, { "epoch": 0.2625, "grad_norm": 2.9375, "grad_norm_var": 37.492431640625, "learning_rate": 0.0001, "loss": 7.7821, "loss/crossentropy": 2.25630784034729, "loss/hidden": 3.0859375, "loss/jsd": 0.0, "loss/logits": 0.2406470775604248, "step": 4200 }, { "epoch": 0.262625, "grad_norm": 2.875, "grad_norm_var": 37.42867431640625, "learning_rate": 0.0001, "loss": 7.8472, "loss/crossentropy": 1.7908839583396912, "loss/hidden": 3.15625, "loss/jsd": 0.0, "loss/logits": 0.25761212408542633, "step": 4202 }, { "epoch": 0.26275, "grad_norm": 2.828125, "grad_norm_var": 37.45735270182292, "learning_rate": 0.0001, "loss": 7.859, "loss/crossentropy": 2.322090983390808, "loss/hidden": 3.1953125, "loss/jsd": 0.0, "loss/logits": 0.25410469621419907, "step": 4204 }, { "epoch": 0.262875, "grad_norm": 2.9375, "grad_norm_var": 37.46189778645833, "learning_rate": 0.0001, "loss": 8.0165, "loss/crossentropy": 2.1619476079940796, "loss/hidden": 2.984375, "loss/jsd": 0.0, "loss/logits": 0.22868604212999344, "step": 4206 }, { "epoch": 0.263, "grad_norm": 2.984375, "grad_norm_var": 37.407884724934895, "learning_rate": 0.0001, "loss": 7.9883, "loss/crossentropy": 2.281681537628174, "loss/hidden": 3.1015625, "loss/jsd": 0.0, "loss/logits": 0.24005106836557388, "step": 4208 }, { "epoch": 0.263125, "grad_norm": 2.859375, "grad_norm_var": 37.25582682291667, "learning_rate": 0.0001, "loss": 8.0909, "loss/crossentropy": 2.1758521795272827, "loss/hidden": 3.1015625, "loss/jsd": 0.0, "loss/logits": 0.24703175574541092, "step": 4210 }, { "epoch": 0.26325, "grad_norm": 2.796875, "grad_norm_var": 37.236812337239584, "learning_rate": 0.0001, "loss": 7.9892, "loss/crossentropy": 2.1196999549865723, "loss/hidden": 3.0703125, "loss/jsd": 0.0, "loss/logits": 0.2565634548664093, "step": 4212 }, { "epoch": 0.263375, "grad_norm": 2.765625, "grad_norm_var": 0.016942342122395832, "learning_rate": 0.0001, "loss": 7.7252, "loss/crossentropy": 2.3114025592803955, "loss/hidden": 3.1328125, "loss/jsd": 0.0, "loss/logits": 0.2358706369996071, "step": 4214 }, { "epoch": 0.2635, "grad_norm": 2.703125, "grad_norm_var": 0.018229166666666668, "learning_rate": 0.0001, "loss": 7.7132, "loss/crossentropy": 2.251305103302002, "loss/hidden": 3.1484375, "loss/jsd": 0.0, "loss/logits": 0.26581814885139465, "step": 4216 }, { "epoch": 0.263625, "grad_norm": 2.765625, "grad_norm_var": 0.018505859375, "learning_rate": 0.0001, "loss": 7.8967, "loss/crossentropy": 2.29295814037323, "loss/hidden": 3.1328125, "loss/jsd": 0.0, "loss/logits": 0.2516941875219345, "step": 4218 }, { "epoch": 0.26375, "grad_norm": 2.796875, "grad_norm_var": 0.016576131184895832, "learning_rate": 0.0001, "loss": 7.9078, "loss/crossentropy": 2.1747488379478455, "loss/hidden": 3.0703125, "loss/jsd": 0.0, "loss/logits": 0.24901378899812698, "step": 4220 }, { "epoch": 0.263875, "grad_norm": 2.921875, "grad_norm_var": 0.016536458333333334, "learning_rate": 0.0001, "loss": 8.0671, "loss/crossentropy": 2.3936630487442017, "loss/hidden": 3.1328125, "loss/jsd": 0.0, "loss/logits": 0.26541993021965027, "step": 4222 }, { "epoch": 0.264, "grad_norm": 2.875, "grad_norm_var": 0.012984212239583333, "learning_rate": 0.0001, "loss": 7.7962, "loss/crossentropy": 2.176760673522949, "loss/hidden": 3.078125, "loss/jsd": 0.0, "loss/logits": 0.2462717890739441, "step": 4224 }, { "epoch": 0.264125, "grad_norm": 2.890625, "grad_norm_var": 0.0095855712890625, "learning_rate": 0.0001, "loss": 8.0654, "loss/crossentropy": 2.3150908946990967, "loss/hidden": 3.140625, "loss/jsd": 0.0, "loss/logits": 0.2580142319202423, "step": 4226 }, { "epoch": 0.26425, "grad_norm": 2.796875, "grad_norm_var": 0.007698567708333334, "learning_rate": 0.0001, "loss": 7.9099, "loss/crossentropy": 2.3362441062927246, "loss/hidden": 3.125, "loss/jsd": 0.0, "loss/logits": 0.2660368084907532, "step": 4228 }, { "epoch": 0.264375, "grad_norm": 3.203125, "grad_norm_var": 0.015771484375, "learning_rate": 0.0001, "loss": 8.15, "loss/crossentropy": 2.5000522136688232, "loss/hidden": 3.15625, "loss/jsd": 0.0, "loss/logits": 0.26757825911045074, "step": 4230 }, { "epoch": 0.2645, "grad_norm": 2.625, "grad_norm_var": 0.019733683268229166, "learning_rate": 0.0001, "loss": 7.8234, "loss/crossentropy": 2.315557360649109, "loss/hidden": 3.109375, "loss/jsd": 0.0, "loss/logits": 0.24116794019937515, "step": 4232 }, { "epoch": 0.264625, "grad_norm": 2.734375, "grad_norm_var": 0.022395833333333334, "learning_rate": 0.0001, "loss": 8.0532, "loss/crossentropy": 2.4713302850723267, "loss/hidden": 3.0546875, "loss/jsd": 0.0, "loss/logits": 0.25229300558567047, "step": 4234 }, { "epoch": 0.26475, "grad_norm": 2.640625, "grad_norm_var": 0.0325592041015625, "learning_rate": 0.0001, "loss": 7.9306, "loss/crossentropy": 2.288376569747925, "loss/hidden": 3.1328125, "loss/jsd": 0.0, "loss/logits": 0.25349900871515274, "step": 4236 }, { "epoch": 0.264875, "grad_norm": 2.8125, "grad_norm_var": 0.030817667643229168, "learning_rate": 0.0001, "loss": 8.0319, "loss/crossentropy": 2.7155719995498657, "loss/hidden": 3.15625, "loss/jsd": 0.0, "loss/logits": 0.2824101895093918, "step": 4238 }, { "epoch": 0.265, "grad_norm": 3.59375, "grad_norm_var": 0.0611328125, "learning_rate": 0.0001, "loss": 8.1044, "loss/crossentropy": 2.4603116512298584, "loss/hidden": 3.125, "loss/jsd": 0.0, "loss/logits": 0.27050715684890747, "step": 4240 }, { "epoch": 0.265125, "grad_norm": 2.75, "grad_norm_var": 0.06310221354166666, "learning_rate": 0.0001, "loss": 7.9081, "loss/crossentropy": 2.46225106716156, "loss/hidden": 3.15625, "loss/jsd": 0.0, "loss/logits": 0.2663005590438843, "step": 4242 }, { "epoch": 0.26525, "grad_norm": 2.796875, "grad_norm_var": 0.06583658854166667, "learning_rate": 0.0001, "loss": 7.9625, "loss/crossentropy": 2.529794931411743, "loss/hidden": 3.0859375, "loss/jsd": 0.0, "loss/logits": 0.2477777898311615, "step": 4244 }, { "epoch": 0.265375, "grad_norm": 2.765625, "grad_norm_var": 0.0628814697265625, "learning_rate": 0.0001, "loss": 7.7489, "loss/crossentropy": 2.095444083213806, "loss/hidden": 3.109375, "loss/jsd": 0.0, "loss/logits": 0.21033120155334473, "step": 4246 }, { "epoch": 0.2655, "grad_norm": 2.828125, "grad_norm_var": 0.057428995768229164, "learning_rate": 0.0001, "loss": 8.029, "loss/crossentropy": 2.6816264390945435, "loss/hidden": 3.21875, "loss/jsd": 0.0, "loss/logits": 0.2776796221733093, "step": 4248 }, { "epoch": 0.265625, "grad_norm": 2.765625, "grad_norm_var": 0.05705464680989583, "learning_rate": 0.0001, "loss": 7.7089, "loss/crossentropy": 2.2003982067108154, "loss/hidden": 3.1015625, "loss/jsd": 0.0, "loss/logits": 0.23354927450418472, "step": 4250 }, { "epoch": 0.26575, "grad_norm": 2.90625, "grad_norm_var": 0.046996053059895834, "learning_rate": 0.0001, "loss": 8.0658, "loss/crossentropy": 2.415517210960388, "loss/hidden": 3.1796875, "loss/jsd": 0.0, "loss/logits": 0.25632981956005096, "step": 4252 }, { "epoch": 0.265875, "grad_norm": 3.0625, "grad_norm_var": 0.049540201822916664, "learning_rate": 0.0001, "loss": 7.9228, "loss/crossentropy": 2.495467185974121, "loss/hidden": 3.15625, "loss/jsd": 0.0, "loss/logits": 0.24749217182397842, "step": 4254 }, { "epoch": 0.266, "grad_norm": 2.671875, "grad_norm_var": 0.01474609375, "learning_rate": 0.0001, "loss": 7.7544, "loss/crossentropy": 2.2713639736175537, "loss/hidden": 3.0625, "loss/jsd": 0.0, "loss/logits": 0.24235820770263672, "step": 4256 }, { "epoch": 0.266125, "grad_norm": 2.671875, "grad_norm_var": 0.016218058268229165, "learning_rate": 0.0001, "loss": 7.6381, "loss/crossentropy": 2.1717538833618164, "loss/hidden": 3.171875, "loss/jsd": 0.0, "loss/logits": 0.2394236922264099, "step": 4258 }, { "epoch": 0.26625, "grad_norm": 2.6875, "grad_norm_var": 0.0172271728515625, "learning_rate": 0.0001, "loss": 7.8459, "loss/crossentropy": 2.148439586162567, "loss/hidden": 3.0234375, "loss/jsd": 0.0, "loss/logits": 0.238677978515625, "step": 4260 }, { "epoch": 0.266375, "grad_norm": 2.671875, "grad_norm_var": 0.019725545247395834, "learning_rate": 0.0001, "loss": 7.6574, "loss/crossentropy": 2.009226441383362, "loss/hidden": 3.078125, "loss/jsd": 0.0, "loss/logits": 0.2179558053612709, "step": 4262 }, { "epoch": 0.2665, "grad_norm": 2.796875, "grad_norm_var": 0.017072550455729165, "learning_rate": 0.0001, "loss": 8.0312, "loss/crossentropy": 2.5849982500076294, "loss/hidden": 3.15625, "loss/jsd": 0.0, "loss/logits": 0.26629510521888733, "step": 4264 }, { "epoch": 0.266625, "grad_norm": 2.84375, "grad_norm_var": 0.0183746337890625, "learning_rate": 0.0001, "loss": 7.7015, "loss/crossentropy": 2.280380964279175, "loss/hidden": 3.1171875, "loss/jsd": 0.0, "loss/logits": 0.23930377513170242, "step": 4266 }, { "epoch": 0.26675, "grad_norm": 2.71875, "grad_norm_var": 0.018424479166666667, "learning_rate": 0.0001, "loss": 7.717, "loss/crossentropy": 2.2330468893051147, "loss/hidden": 3.109375, "loss/jsd": 0.0, "loss/logits": 0.24855167418718338, "step": 4268 }, { "epoch": 0.266875, "grad_norm": 2.859375, "grad_norm_var": 0.013548787434895833, "learning_rate": 0.0001, "loss": 7.8267, "loss/crossentropy": 1.9139932990074158, "loss/hidden": 3.125, "loss/jsd": 0.0, "loss/logits": 0.2389245703816414, "step": 4270 }, { "epoch": 0.267, "grad_norm": 2.953125, "grad_norm_var": 0.014435831705729167, "learning_rate": 0.0001, "loss": 7.7517, "loss/crossentropy": 2.241062879562378, "loss/hidden": 3.1171875, "loss/jsd": 0.0, "loss/logits": 0.2435523197054863, "step": 4272 }, { "epoch": 0.267125, "grad_norm": 2.78125, "grad_norm_var": 0.013802083333333333, "learning_rate": 0.0001, "loss": 7.8894, "loss/crossentropy": 2.0998335480690002, "loss/hidden": 3.046875, "loss/jsd": 0.0, "loss/logits": 0.23158778250217438, "step": 4274 }, { "epoch": 0.26725, "grad_norm": 2.921875, "grad_norm_var": 0.013556925455729167, "learning_rate": 0.0001, "loss": 8.0106, "loss/crossentropy": 2.196057438850403, "loss/hidden": 3.1328125, "loss/jsd": 0.0, "loss/logits": 0.24100814759731293, "step": 4276 }, { "epoch": 0.267375, "grad_norm": 2.828125, "grad_norm_var": 0.009814453125, "learning_rate": 0.0001, "loss": 7.9131, "loss/crossentropy": 2.0391287803649902, "loss/hidden": 3.0390625, "loss/jsd": 0.0, "loss/logits": 0.22870135307312012, "step": 4278 }, { "epoch": 0.2675, "grad_norm": 2.875, "grad_norm_var": 0.0099273681640625, "learning_rate": 0.0001, "loss": 7.9463, "loss/crossentropy": 2.24883496761322, "loss/hidden": 3.15625, "loss/jsd": 0.0, "loss/logits": 0.25473955273628235, "step": 4280 }, { "epoch": 0.267625, "grad_norm": 2.875, "grad_norm_var": 0.00982666015625, "learning_rate": 0.0001, "loss": 7.8434, "loss/crossentropy": 2.1515504121780396, "loss/hidden": 3.0546875, "loss/jsd": 0.0, "loss/logits": 0.2581338733434677, "step": 4282 }, { "epoch": 0.26775, "grad_norm": 2.828125, "grad_norm_var": 0.009501139322916666, "learning_rate": 0.0001, "loss": 7.7375, "loss/crossentropy": 2.1803826093673706, "loss/hidden": 3.1015625, "loss/jsd": 0.0, "loss/logits": 0.24130553007125854, "step": 4284 }, { "epoch": 0.267875, "grad_norm": 2.859375, "grad_norm_var": 0.0082183837890625, "learning_rate": 0.0001, "loss": 7.8611, "loss/crossentropy": 2.196977734565735, "loss/hidden": 3.125, "loss/jsd": 0.0, "loss/logits": 0.24320388585329056, "step": 4286 }, { "epoch": 0.268, "grad_norm": 2.984375, "grad_norm_var": 0.009956868489583333, "learning_rate": 0.0001, "loss": 7.6155, "loss/crossentropy": 2.17727530002594, "loss/hidden": 3.1171875, "loss/jsd": 0.0, "loss/logits": 0.25908537209033966, "step": 4288 }, { "epoch": 0.268125, "grad_norm": 2.703125, "grad_norm_var": 0.008348592122395833, "learning_rate": 0.0001, "loss": 7.8134, "loss/crossentropy": 2.053568482398987, "loss/hidden": 3.09375, "loss/jsd": 0.0, "loss/logits": 0.23145954310894012, "step": 4290 }, { "epoch": 0.26825, "grad_norm": 2.90625, "grad_norm_var": 0.009195963541666666, "learning_rate": 0.0001, "loss": 8.1768, "loss/crossentropy": 2.613425374031067, "loss/hidden": 3.1171875, "loss/jsd": 0.0, "loss/logits": 0.258578285574913, "step": 4292 }, { "epoch": 0.268375, "grad_norm": 2.9375, "grad_norm_var": 0.009147135416666667, "learning_rate": 0.0001, "loss": 7.9366, "loss/crossentropy": 2.2620270252227783, "loss/hidden": 3.0625, "loss/jsd": 0.0, "loss/logits": 0.246913842856884, "step": 4294 }, { "epoch": 0.2685, "grad_norm": 2.71875, "grad_norm_var": 0.01041259765625, "learning_rate": 0.0001, "loss": 8.0019, "loss/crossentropy": 2.3790470361709595, "loss/hidden": 3.1015625, "loss/jsd": 0.0, "loss/logits": 0.25181396305561066, "step": 4296 }, { "epoch": 0.268625, "grad_norm": 2.703125, "grad_norm_var": 0.011408487955729166, "learning_rate": 0.0001, "loss": 7.8053, "loss/crossentropy": 2.0822721123695374, "loss/hidden": 3.1328125, "loss/jsd": 0.0, "loss/logits": 0.24697626382112503, "step": 4298 }, { "epoch": 0.26875, "grad_norm": 2.703125, "grad_norm_var": 0.011865234375, "learning_rate": 0.0001, "loss": 8.0199, "loss/crossentropy": 2.3127192854881287, "loss/hidden": 3.1015625, "loss/jsd": 0.0, "loss/logits": 0.2468092292547226, "step": 4300 }, { "epoch": 0.268875, "grad_norm": 2.890625, "grad_norm_var": 0.01412353515625, "learning_rate": 0.0001, "loss": 7.8368, "loss/crossentropy": 2.2731138467788696, "loss/hidden": 3.1328125, "loss/jsd": 0.0, "loss/logits": 0.21551719307899475, "step": 4302 }, { "epoch": 0.269, "grad_norm": 2.765625, "grad_norm_var": 0.012300618489583333, "learning_rate": 0.0001, "loss": 7.9, "loss/crossentropy": 2.054975211620331, "loss/hidden": 3.0703125, "loss/jsd": 0.0, "loss/logits": 0.24493585526943207, "step": 4304 }, { "epoch": 0.269125, "grad_norm": 3.09375, "grad_norm_var": 0.0177886962890625, "learning_rate": 0.0001, "loss": 8.0046, "loss/crossentropy": 2.2912334203720093, "loss/hidden": 3.1875, "loss/jsd": 0.0, "loss/logits": 0.274725005030632, "step": 4306 }, { "epoch": 0.26925, "grad_norm": 2.703125, "grad_norm_var": 0.0199859619140625, "learning_rate": 0.0001, "loss": 7.91, "loss/crossentropy": 2.160645306110382, "loss/hidden": 3.0625, "loss/jsd": 0.0, "loss/logits": 0.2310664877295494, "step": 4308 }, { "epoch": 0.269375, "grad_norm": 2.8125, "grad_norm_var": 0.019071451822916665, "learning_rate": 0.0001, "loss": 7.9594, "loss/crossentropy": 2.431759834289551, "loss/hidden": 3.03125, "loss/jsd": 0.0, "loss/logits": 0.23952233791351318, "step": 4310 }, { "epoch": 0.2695, "grad_norm": 3.078125, "grad_norm_var": 0.029588826497395835, "learning_rate": 0.0001, "loss": 7.639, "loss/crossentropy": 2.058727204799652, "loss/hidden": 3.0859375, "loss/jsd": 0.0, "loss/logits": 0.2312949374318123, "step": 4312 }, { "epoch": 0.269625, "grad_norm": 2.765625, "grad_norm_var": 0.0277496337890625, "learning_rate": 0.0001, "loss": 7.8787, "loss/crossentropy": 2.32631516456604, "loss/hidden": 3.1015625, "loss/jsd": 0.0, "loss/logits": 0.2474951222538948, "step": 4314 }, { "epoch": 0.26975, "grad_norm": 2.59375, "grad_norm_var": 0.032450358072916664, "learning_rate": 0.0001, "loss": 7.6296, "loss/crossentropy": 2.101090967655182, "loss/hidden": 3.15625, "loss/jsd": 0.0, "loss/logits": 0.242124542593956, "step": 4316 }, { "epoch": 0.269875, "grad_norm": 2.890625, "grad_norm_var": 0.030573527018229168, "learning_rate": 0.0001, "loss": 8.0219, "loss/crossentropy": 2.322851777076721, "loss/hidden": 3.0546875, "loss/jsd": 0.0, "loss/logits": 0.25095701962709427, "step": 4318 }, { "epoch": 0.27, "grad_norm": 2.796875, "grad_norm_var": 0.0295562744140625, "learning_rate": 0.0001, "loss": 7.6156, "loss/crossentropy": 2.236197590827942, "loss/hidden": 3.109375, "loss/jsd": 0.0, "loss/logits": 0.2439088299870491, "step": 4320 }, { "epoch": 0.270125, "grad_norm": 2.640625, "grad_norm_var": 0.031468709309895836, "learning_rate": 0.0001, "loss": 7.7685, "loss/crossentropy": 2.2455419301986694, "loss/hidden": 3.09375, "loss/jsd": 0.0, "loss/logits": 0.22935181856155396, "step": 4322 }, { "epoch": 0.27025, "grad_norm": 2.671875, "grad_norm_var": 0.02935791015625, "learning_rate": 0.0001, "loss": 7.8108, "loss/crossentropy": 2.286752223968506, "loss/hidden": 3.109375, "loss/jsd": 0.0, "loss/logits": 0.24976222962141037, "step": 4324 }, { "epoch": 0.270375, "grad_norm": 2.75, "grad_norm_var": 0.030562337239583334, "learning_rate": 0.0001, "loss": 7.6777, "loss/crossentropy": 2.2670631408691406, "loss/hidden": 3.125, "loss/jsd": 0.0, "loss/logits": 0.24830525368452072, "step": 4326 }, { "epoch": 0.2705, "grad_norm": 2.78125, "grad_norm_var": 0.020221964518229166, "learning_rate": 0.0001, "loss": 8.0639, "loss/crossentropy": 2.243753433227539, "loss/hidden": 3.0859375, "loss/jsd": 0.0, "loss/logits": 0.24500297009944916, "step": 4328 }, { "epoch": 0.270625, "grad_norm": 3.109375, "grad_norm_var": 0.0255767822265625, "learning_rate": 0.0001, "loss": 7.9603, "loss/crossentropy": 1.9934114217758179, "loss/hidden": 2.984375, "loss/jsd": 0.0, "loss/logits": 0.2386907860636711, "step": 4330 }, { "epoch": 0.27075, "grad_norm": 2.71875, "grad_norm_var": 0.026953125, "learning_rate": 0.0001, "loss": 7.7582, "loss/crossentropy": 2.0450265407562256, "loss/hidden": 3.078125, "loss/jsd": 0.0, "loss/logits": 0.23710596561431885, "step": 4332 }, { "epoch": 0.270875, "grad_norm": 2.828125, "grad_norm_var": 0.025877888997395834, "learning_rate": 0.0001, "loss": 7.8787, "loss/crossentropy": 2.2991667985916138, "loss/hidden": 3.1484375, "loss/jsd": 0.0, "loss/logits": 0.2548564076423645, "step": 4334 }, { "epoch": 0.271, "grad_norm": 2.90625, "grad_norm_var": 0.026057942708333334, "learning_rate": 0.0001, "loss": 7.9276, "loss/crossentropy": 2.1133607625961304, "loss/hidden": 3.1171875, "loss/jsd": 0.0, "loss/logits": 0.2315070927143097, "step": 4336 }, { "epoch": 0.271125, "grad_norm": 2.8125, "grad_norm_var": 0.0191802978515625, "learning_rate": 0.0001, "loss": 8.0528, "loss/crossentropy": 2.3985393047332764, "loss/hidden": 3.140625, "loss/jsd": 0.0, "loss/logits": 0.27966225147247314, "step": 4338 }, { "epoch": 0.27125, "grad_norm": 2.921875, "grad_norm_var": 0.018782552083333334, "learning_rate": 0.0001, "loss": 7.9686, "loss/crossentropy": 2.226362109184265, "loss/hidden": 3.1171875, "loss/jsd": 0.0, "loss/logits": 0.2630234956741333, "step": 4340 }, { "epoch": 0.271375, "grad_norm": 2.9375, "grad_norm_var": 0.017772420247395834, "learning_rate": 0.0001, "loss": 7.8807, "loss/crossentropy": 2.2515182495117188, "loss/hidden": 3.109375, "loss/jsd": 0.0, "loss/logits": 0.22476138174533844, "step": 4342 }, { "epoch": 0.2715, "grad_norm": 2.90625, "grad_norm_var": 0.012239583333333333, "learning_rate": 0.0001, "loss": 7.9125, "loss/crossentropy": 2.3261423110961914, "loss/hidden": 3.2265625, "loss/jsd": 0.0, "loss/logits": 0.246418759226799, "step": 4344 }, { "epoch": 0.271625, "grad_norm": 2.65625, "grad_norm_var": 0.010920206705729166, "learning_rate": 0.0001, "loss": 7.8189, "loss/crossentropy": 2.291730523109436, "loss/hidden": 3.078125, "loss/jsd": 0.0, "loss/logits": 0.23340294510126114, "step": 4346 }, { "epoch": 0.27175, "grad_norm": 3.03125, "grad_norm_var": 0.0102203369140625, "learning_rate": 0.0001, "loss": 7.9663, "loss/crossentropy": 2.515085816383362, "loss/hidden": 3.1640625, "loss/jsd": 0.0, "loss/logits": 0.2662329450249672, "step": 4348 }, { "epoch": 0.271875, "grad_norm": 2.8125, "grad_norm_var": 0.01005859375, "learning_rate": 0.0001, "loss": 7.8887, "loss/crossentropy": 2.2541251182556152, "loss/hidden": 3.09375, "loss/jsd": 0.0, "loss/logits": 0.2438943237066269, "step": 4350 }, { "epoch": 0.272, "grad_norm": 2.796875, "grad_norm_var": 0.011042277018229166, "learning_rate": 0.0001, "loss": 7.7585, "loss/crossentropy": 2.2722275257110596, "loss/hidden": 3.109375, "loss/jsd": 0.0, "loss/logits": 0.2622324079275131, "step": 4352 }, { "epoch": 0.272125, "grad_norm": 2.765625, "grad_norm_var": 0.011286417643229166, "learning_rate": 0.0001, "loss": 7.9619, "loss/crossentropy": 2.3624770641326904, "loss/hidden": 3.09375, "loss/jsd": 0.0, "loss/logits": 0.24193863570690155, "step": 4354 }, { "epoch": 0.27225, "grad_norm": 2.84375, "grad_norm_var": 0.012723795572916667, "learning_rate": 0.0001, "loss": 7.8458, "loss/crossentropy": 2.0874279737472534, "loss/hidden": 3.0078125, "loss/jsd": 0.0, "loss/logits": 0.23463068902492523, "step": 4356 }, { "epoch": 0.272375, "grad_norm": 2.84375, "grad_norm_var": 0.013212076822916667, "learning_rate": 0.0001, "loss": 8.1091, "loss/crossentropy": 2.5377037525177, "loss/hidden": 3.140625, "loss/jsd": 0.0, "loss/logits": 0.26963868737220764, "step": 4358 }, { "epoch": 0.2725, "grad_norm": 2.75, "grad_norm_var": 0.0118804931640625, "learning_rate": 0.0001, "loss": 7.845, "loss/crossentropy": 2.3017483949661255, "loss/hidden": 3.1015625, "loss/jsd": 0.0, "loss/logits": 0.23473241925239563, "step": 4360 }, { "epoch": 0.272625, "grad_norm": 2.84375, "grad_norm_var": 0.011498006184895833, "learning_rate": 0.0001, "loss": 7.811, "loss/crossentropy": 2.403724431991577, "loss/hidden": 3.0625, "loss/jsd": 0.0, "loss/logits": 0.2441314458847046, "step": 4362 }, { "epoch": 0.27275, "grad_norm": 2.75, "grad_norm_var": 0.0100982666015625, "learning_rate": 0.0001, "loss": 7.8422, "loss/crossentropy": 2.1351675987243652, "loss/hidden": 3.1640625, "loss/jsd": 0.0, "loss/logits": 0.2546608969569206, "step": 4364 }, { "epoch": 0.272875, "grad_norm": 2.78125, "grad_norm_var": 0.010497029622395833, "learning_rate": 0.0001, "loss": 7.7895, "loss/crossentropy": 2.2290525436401367, "loss/hidden": 3.09375, "loss/jsd": 0.0, "loss/logits": 0.24564901739358902, "step": 4366 }, { "epoch": 0.273, "grad_norm": 2.75, "grad_norm_var": 0.009276326497395833, "learning_rate": 0.0001, "loss": 7.787, "loss/crossentropy": 2.3624547719955444, "loss/hidden": 3.109375, "loss/jsd": 0.0, "loss/logits": 0.2411060780286789, "step": 4368 }, { "epoch": 0.273125, "grad_norm": 2.625, "grad_norm_var": 0.012044270833333334, "learning_rate": 0.0001, "loss": 7.7219, "loss/crossentropy": 2.3433109521865845, "loss/hidden": 3.15625, "loss/jsd": 0.0, "loss/logits": 0.2539641708135605, "step": 4370 }, { "epoch": 0.27325, "grad_norm": 2.609375, "grad_norm_var": 0.013602701822916667, "learning_rate": 0.0001, "loss": 7.8149, "loss/crossentropy": 2.186747908592224, "loss/hidden": 3.15625, "loss/jsd": 0.0, "loss/logits": 0.24265237152576447, "step": 4372 }, { "epoch": 0.273375, "grad_norm": 2.6875, "grad_norm_var": 0.011156209309895833, "learning_rate": 0.0001, "loss": 7.8105, "loss/crossentropy": 2.099400222301483, "loss/hidden": 3.1015625, "loss/jsd": 0.0, "loss/logits": 0.2536737322807312, "step": 4374 }, { "epoch": 0.2735, "grad_norm": 3.171875, "grad_norm_var": 0.020457967122395834, "learning_rate": 0.0001, "loss": 8.1088, "loss/crossentropy": 2.4713211059570312, "loss/hidden": 3.0546875, "loss/jsd": 0.0, "loss/logits": 0.245501309633255, "step": 4376 }, { "epoch": 0.273625, "grad_norm": 2.890625, "grad_norm_var": 0.017707316080729167, "learning_rate": 0.0001, "loss": 7.9415, "loss/crossentropy": 2.414775848388672, "loss/hidden": 3.1328125, "loss/jsd": 0.0, "loss/logits": 0.260262668132782, "step": 4378 }, { "epoch": 0.27375, "grad_norm": 2.6875, "grad_norm_var": 0.01943359375, "learning_rate": 0.0001, "loss": 7.8479, "loss/crossentropy": 2.19328773021698, "loss/hidden": 3.1015625, "loss/jsd": 0.0, "loss/logits": 0.21873796731233597, "step": 4380 }, { "epoch": 0.273875, "grad_norm": 2.75, "grad_norm_var": 0.019820149739583334, "learning_rate": 0.0001, "loss": 7.5374, "loss/crossentropy": 2.3443480730056763, "loss/hidden": 3.0546875, "loss/jsd": 0.0, "loss/logits": 0.25173987448215485, "step": 4382 }, { "epoch": 0.274, "grad_norm": 2.921875, "grad_norm_var": 0.0206939697265625, "learning_rate": 0.0001, "loss": 7.9702, "loss/crossentropy": 2.511129379272461, "loss/hidden": 3.1171875, "loss/jsd": 0.0, "loss/logits": 0.25115419924259186, "step": 4384 }, { "epoch": 0.274125, "grad_norm": 2.890625, "grad_norm_var": 0.0194244384765625, "learning_rate": 0.0001, "loss": 7.963, "loss/crossentropy": 2.435807943344116, "loss/hidden": 3.078125, "loss/jsd": 0.0, "loss/logits": 0.24563057720661163, "step": 4386 }, { "epoch": 0.27425, "grad_norm": 3.03125, "grad_norm_var": 0.020905558268229166, "learning_rate": 0.0001, "loss": 7.7854, "loss/crossentropy": 2.271300435066223, "loss/hidden": 3.109375, "loss/jsd": 0.0, "loss/logits": 0.24015212804079056, "step": 4388 }, { "epoch": 0.274375, "grad_norm": 2.828125, "grad_norm_var": 0.021727498372395834, "learning_rate": 0.0001, "loss": 7.8126, "loss/crossentropy": 2.4717822074890137, "loss/hidden": 3.15625, "loss/jsd": 0.0, "loss/logits": 0.2600596249103546, "step": 4390 }, { "epoch": 0.2745, "grad_norm": 3.171875, "grad_norm_var": 0.0215728759765625, "learning_rate": 0.0001, "loss": 7.8985, "loss/crossentropy": 2.4676023721694946, "loss/hidden": 3.171875, "loss/jsd": 0.0, "loss/logits": 0.2669646441936493, "step": 4392 }, { "epoch": 0.274625, "grad_norm": 2.5625, "grad_norm_var": 0.027684529622395832, "learning_rate": 0.0001, "loss": 7.7801, "loss/crossentropy": 2.317070960998535, "loss/hidden": 3.1171875, "loss/jsd": 0.0, "loss/logits": 0.25364139676094055, "step": 4394 }, { "epoch": 0.27475, "grad_norm": 2.71875, "grad_norm_var": 0.026708984375, "learning_rate": 0.0001, "loss": 7.7771, "loss/crossentropy": 2.2764382362365723, "loss/hidden": 3.09375, "loss/jsd": 0.0, "loss/logits": 0.24154330044984818, "step": 4396 }, { "epoch": 0.274875, "grad_norm": 3.0625, "grad_norm_var": 0.027701822916666667, "learning_rate": 0.0001, "loss": 8.0874, "loss/crossentropy": 2.1570043563842773, "loss/hidden": 3.0859375, "loss/jsd": 0.0, "loss/logits": 0.2519639804959297, "step": 4398 }, { "epoch": 0.275, "grad_norm": 2.75, "grad_norm_var": 0.0288970947265625, "learning_rate": 0.0001, "loss": 7.9668, "loss/crossentropy": 2.2437803745269775, "loss/hidden": 3.09375, "loss/jsd": 0.0, "loss/logits": 0.23716624081134796, "step": 4400 }, { "epoch": 0.275125, "grad_norm": 2.8125, "grad_norm_var": 0.028807576497395834, "learning_rate": 0.0001, "loss": 7.8597, "loss/crossentropy": 2.3062546253204346, "loss/hidden": 3.1015625, "loss/jsd": 0.0, "loss/logits": 0.24317392706871033, "step": 4402 }, { "epoch": 0.27525, "grad_norm": 2.984375, "grad_norm_var": 0.029515584309895832, "learning_rate": 0.0001, "loss": 7.8313, "loss/crossentropy": 2.227596402168274, "loss/hidden": 3.0390625, "loss/jsd": 0.0, "loss/logits": 0.21904651820659637, "step": 4404 }, { "epoch": 0.275375, "grad_norm": 2.921875, "grad_norm_var": 0.026122029622395834, "learning_rate": 0.0001, "loss": 7.931, "loss/crossentropy": 2.525656580924988, "loss/hidden": 3.1171875, "loss/jsd": 0.0, "loss/logits": 0.23951703310012817, "step": 4406 }, { "epoch": 0.2755, "grad_norm": 2.734375, "grad_norm_var": 0.017975870768229166, "learning_rate": 0.0001, "loss": 7.9898, "loss/crossentropy": 2.3638256788253784, "loss/hidden": 3.0390625, "loss/jsd": 0.0, "loss/logits": 0.24081310629844666, "step": 4408 }, { "epoch": 0.275625, "grad_norm": 2.890625, "grad_norm_var": 0.0179595947265625, "learning_rate": 0.0001, "loss": 7.5821, "loss/crossentropy": 2.177635431289673, "loss/hidden": 3.109375, "loss/jsd": 0.0, "loss/logits": 0.24490711838006973, "step": 4410 }, { "epoch": 0.27575, "grad_norm": 2.984375, "grad_norm_var": 0.017601521809895833, "learning_rate": 0.0001, "loss": 8.0015, "loss/crossentropy": 2.110883593559265, "loss/hidden": 3.125, "loss/jsd": 0.0, "loss/logits": 0.2512647360563278, "step": 4412 }, { "epoch": 0.275875, "grad_norm": 3.03125, "grad_norm_var": 0.018610636393229168, "learning_rate": 0.0001, "loss": 7.8356, "loss/crossentropy": 2.4385801553726196, "loss/hidden": 3.109375, "loss/jsd": 0.0, "loss/logits": 0.24002321809530258, "step": 4414 }, { "epoch": 0.276, "grad_norm": 2.890625, "grad_norm_var": 0.018781534830729165, "learning_rate": 0.0001, "loss": 7.6569, "loss/crossentropy": 2.242212653160095, "loss/hidden": 3.0078125, "loss/jsd": 0.0, "loss/logits": 0.2328862026333809, "step": 4416 }, { "epoch": 0.276125, "grad_norm": 2.71875, "grad_norm_var": 0.0196197509765625, "learning_rate": 0.0001, "loss": 8.0125, "loss/crossentropy": 2.2374762296676636, "loss/hidden": 3.1640625, "loss/jsd": 0.0, "loss/logits": 0.2608456015586853, "step": 4418 }, { "epoch": 0.27625, "grad_norm": 2.8125, "grad_norm_var": 0.0154296875, "learning_rate": 0.0001, "loss": 7.7844, "loss/crossentropy": 2.2297643423080444, "loss/hidden": 3.125, "loss/jsd": 0.0, "loss/logits": 0.22976786643266678, "step": 4420 }, { "epoch": 0.276375, "grad_norm": 2.96875, "grad_norm_var": 0.019352213541666666, "learning_rate": 0.0001, "loss": 8.0492, "loss/crossentropy": 2.2385973930358887, "loss/hidden": 3.0625, "loss/jsd": 0.0, "loss/logits": 0.2559010833501816, "step": 4422 }, { "epoch": 0.2765, "grad_norm": 2.828125, "grad_norm_var": 0.019364420572916666, "learning_rate": 0.0001, "loss": 7.9282, "loss/crossentropy": 2.3769255876541138, "loss/hidden": 3.140625, "loss/jsd": 0.0, "loss/logits": 0.2570715621113777, "step": 4424 }, { "epoch": 0.276625, "grad_norm": 11.8125, "grad_norm_var": 5.01842041015625, "learning_rate": 0.0001, "loss": 8.4578, "loss/crossentropy": 2.452933430671692, "loss/hidden": 3.171875, "loss/jsd": 0.0, "loss/logits": 0.27480727434158325, "step": 4426 }, { "epoch": 0.27675, "grad_norm": 3.09375, "grad_norm_var": 4.997378540039063, "learning_rate": 0.0001, "loss": 7.7989, "loss/crossentropy": 2.1652350425720215, "loss/hidden": 3.15625, "loss/jsd": 0.0, "loss/logits": 0.2554679661989212, "step": 4428 }, { "epoch": 0.276875, "grad_norm": 2.765625, "grad_norm_var": 4.998335774739584, "learning_rate": 0.0001, "loss": 7.8489, "loss/crossentropy": 2.27901029586792, "loss/hidden": 3.125, "loss/jsd": 0.0, "loss/logits": 0.252231702208519, "step": 4430 }, { "epoch": 0.277, "grad_norm": 2.671875, "grad_norm_var": 5.025942993164063, "learning_rate": 0.0001, "loss": 7.8509, "loss/crossentropy": 2.250112295150757, "loss/hidden": 3.1328125, "loss/jsd": 0.0, "loss/logits": 0.24269716441631317, "step": 4432 }, { "epoch": 0.277125, "grad_norm": 3.015625, "grad_norm_var": 5.0053049723307295, "learning_rate": 0.0001, "loss": 7.8296, "loss/crossentropy": 2.2260528802871704, "loss/hidden": 3.109375, "loss/jsd": 0.0, "loss/logits": 0.23317068815231323, "step": 4434 }, { "epoch": 0.27725, "grad_norm": 2.71875, "grad_norm_var": 5.0169423421223955, "learning_rate": 0.0001, "loss": 7.9135, "loss/crossentropy": 2.2177956104278564, "loss/hidden": 3.125, "loss/jsd": 0.0, "loss/logits": 0.24023041874170303, "step": 4436 }, { "epoch": 0.277375, "grad_norm": 2.59375, "grad_norm_var": 5.074128214518229, "learning_rate": 0.0001, "loss": 7.8935, "loss/crossentropy": 2.5362316370010376, "loss/hidden": 3.171875, "loss/jsd": 0.0, "loss/logits": 0.2523645684123039, "step": 4438 }, { "epoch": 0.2775, "grad_norm": 2.625, "grad_norm_var": 5.099312337239583, "learning_rate": 0.0001, "loss": 7.688, "loss/crossentropy": 2.1544612646102905, "loss/hidden": 3.046875, "loss/jsd": 0.0, "loss/logits": 0.241964653134346, "step": 4440 }, { "epoch": 0.277625, "grad_norm": 2.90625, "grad_norm_var": 0.10777587890625, "learning_rate": 0.0001, "loss": 7.7551, "loss/crossentropy": 2.1552851796150208, "loss/hidden": 3.109375, "loss/jsd": 0.0, "loss/logits": 0.23890065401792526, "step": 4442 }, { "epoch": 0.27775, "grad_norm": 2.6875, "grad_norm_var": 0.030810546875, "learning_rate": 0.0001, "loss": 7.8063, "loss/crossentropy": 2.278067708015442, "loss/hidden": 3.09375, "loss/jsd": 0.0, "loss/logits": 0.23785259574651718, "step": 4444 }, { "epoch": 0.277875, "grad_norm": 2.828125, "grad_norm_var": 0.032063802083333336, "learning_rate": 0.0001, "loss": 7.813, "loss/crossentropy": 2.3867409229278564, "loss/hidden": 3.0859375, "loss/jsd": 0.0, "loss/logits": 0.2522669658064842, "step": 4446 }, { "epoch": 0.278, "grad_norm": 2.734375, "grad_norm_var": 0.031403605143229166, "learning_rate": 0.0001, "loss": 7.8366, "loss/crossentropy": 2.4174715280532837, "loss/hidden": 3.1640625, "loss/jsd": 0.0, "loss/logits": 0.2608378678560257, "step": 4448 }, { "epoch": 0.278125, "grad_norm": 2.71875, "grad_norm_var": 0.027546183268229166, "learning_rate": 0.0001, "loss": 7.9311, "loss/crossentropy": 2.2817476987838745, "loss/hidden": 3.15625, "loss/jsd": 0.0, "loss/logits": 0.28454217314720154, "step": 4450 }, { "epoch": 0.27825, "grad_norm": 2.609375, "grad_norm_var": 0.028571573893229167, "learning_rate": 0.0001, "loss": 7.7523, "loss/crossentropy": 2.2681050300598145, "loss/hidden": 3.1015625, "loss/jsd": 0.0, "loss/logits": 0.2305116131901741, "step": 4452 }, { "epoch": 0.278375, "grad_norm": 2.765625, "grad_norm_var": 0.026764933268229166, "learning_rate": 0.0001, "loss": 8.0617, "loss/crossentropy": 2.3880372047424316, "loss/hidden": 3.1171875, "loss/jsd": 0.0, "loss/logits": 0.2551140636205673, "step": 4454 }, { "epoch": 0.2785, "grad_norm": 3.375, "grad_norm_var": 0.046219889322916666, "learning_rate": 0.0001, "loss": 7.9784, "loss/crossentropy": 2.243477165699005, "loss/hidden": 3.1171875, "loss/jsd": 0.0, "loss/logits": 0.2374018207192421, "step": 4456 }, { "epoch": 0.278625, "grad_norm": 2.890625, "grad_norm_var": 0.03169657389322917, "learning_rate": 0.0001, "loss": 8.0035, "loss/crossentropy": 2.093716263771057, "loss/hidden": 3.15625, "loss/jsd": 0.0, "loss/logits": 0.27225764095783234, "step": 4458 }, { "epoch": 0.27875, "grad_norm": 2.65625, "grad_norm_var": 0.03313395182291667, "learning_rate": 0.0001, "loss": 7.9323, "loss/crossentropy": 2.414399743080139, "loss/hidden": 3.1328125, "loss/jsd": 0.0, "loss/logits": 0.2585732489824295, "step": 4460 }, { "epoch": 0.278875, "grad_norm": 2.703125, "grad_norm_var": 0.0335845947265625, "learning_rate": 0.0001, "loss": 7.8053, "loss/crossentropy": 2.370185613632202, "loss/hidden": 3.15625, "loss/jsd": 0.0, "loss/logits": 0.24910618364810944, "step": 4462 }, { "epoch": 0.279, "grad_norm": 2.921875, "grad_norm_var": 0.03368733723958333, "learning_rate": 0.0001, "loss": 7.925, "loss/crossentropy": 2.2157377004623413, "loss/hidden": 3.09375, "loss/jsd": 0.0, "loss/logits": 0.2450287863612175, "step": 4464 }, { "epoch": 0.279125, "grad_norm": 2.84375, "grad_norm_var": 0.0333404541015625, "learning_rate": 0.0001, "loss": 7.8908, "loss/crossentropy": 2.1479971408843994, "loss/hidden": 3.109375, "loss/jsd": 0.0, "loss/logits": 0.23871061950922012, "step": 4466 }, { "epoch": 0.27925, "grad_norm": 2.921875, "grad_norm_var": 0.03318583170572917, "learning_rate": 0.0001, "loss": 7.9178, "loss/crossentropy": 2.439133882522583, "loss/hidden": 3.1171875, "loss/jsd": 0.0, "loss/logits": 0.2338852807879448, "step": 4468 }, { "epoch": 0.279375, "grad_norm": 2.65625, "grad_norm_var": 0.03595377604166667, "learning_rate": 0.0001, "loss": 7.994, "loss/crossentropy": 2.439359426498413, "loss/hidden": 3.1328125, "loss/jsd": 0.0, "loss/logits": 0.25949567556381226, "step": 4470 }, { "epoch": 0.2795, "grad_norm": 2.953125, "grad_norm_var": 0.019172159830729167, "learning_rate": 0.0001, "loss": 7.726, "loss/crossentropy": 2.1314167380332947, "loss/hidden": 3.0546875, "loss/jsd": 0.0, "loss/logits": 0.2279723584651947, "step": 4472 }, { "epoch": 0.279625, "grad_norm": 2.71875, "grad_norm_var": 0.018366495768229168, "learning_rate": 0.0001, "loss": 8.0947, "loss/crossentropy": 2.388408660888672, "loss/hidden": 3.171875, "loss/jsd": 0.0, "loss/logits": 0.24749407172203064, "step": 4474 }, { "epoch": 0.27975, "grad_norm": 2.53125, "grad_norm_var": 0.022948201497395834, "learning_rate": 0.0001, "loss": 7.7714, "loss/crossentropy": 2.346164107322693, "loss/hidden": 3.09375, "loss/jsd": 0.0, "loss/logits": 0.2525106370449066, "step": 4476 }, { "epoch": 0.279875, "grad_norm": 2.984375, "grad_norm_var": 0.024657185872395834, "learning_rate": 0.0001, "loss": 8.0466, "loss/crossentropy": 2.3998990058898926, "loss/hidden": 3.1796875, "loss/jsd": 0.0, "loss/logits": 0.25072547793388367, "step": 4478 }, { "epoch": 0.28, "grad_norm": 2.6875, "grad_norm_var": 0.029520670572916668, "learning_rate": 0.0001, "loss": 7.9708, "loss/crossentropy": 2.4698004722595215, "loss/hidden": 3.109375, "loss/jsd": 0.0, "loss/logits": 0.24102972447872162, "step": 4480 }, { "epoch": 0.280125, "grad_norm": 3.046875, "grad_norm_var": 0.031148274739583332, "learning_rate": 0.0001, "loss": 7.8292, "loss/crossentropy": 2.158740282058716, "loss/hidden": 2.9921875, "loss/jsd": 0.0, "loss/logits": 0.22618430107831955, "step": 4482 }, { "epoch": 0.28025, "grad_norm": 2.484375, "grad_norm_var": 0.034455362955729166, "learning_rate": 0.0001, "loss": 7.8979, "loss/crossentropy": 2.408037781715393, "loss/hidden": 3.125, "loss/jsd": 0.0, "loss/logits": 0.2516814023256302, "step": 4484 }, { "epoch": 0.280375, "grad_norm": 2.78125, "grad_norm_var": 0.03159891764322917, "learning_rate": 0.0001, "loss": 7.7633, "loss/crossentropy": 2.234240174293518, "loss/hidden": 3.09375, "loss/jsd": 0.0, "loss/logits": 0.2487160712480545, "step": 4486 }, { "epoch": 0.2805, "grad_norm": 2.703125, "grad_norm_var": 0.03135477701822917, "learning_rate": 0.0001, "loss": 7.989, "loss/crossentropy": 2.3690484762191772, "loss/hidden": 3.125, "loss/jsd": 0.0, "loss/logits": 0.23876956850290298, "step": 4488 }, { "epoch": 0.280625, "grad_norm": 2.78125, "grad_norm_var": 0.03092041015625, "learning_rate": 0.0001, "loss": 7.7085, "loss/crossentropy": 2.08196222782135, "loss/hidden": 3.125, "loss/jsd": 0.0, "loss/logits": 0.2360798791050911, "step": 4490 }, { "epoch": 0.28075, "grad_norm": 2.71875, "grad_norm_var": 0.025178019205729166, "learning_rate": 0.0001, "loss": 8.051, "loss/crossentropy": 2.489022374153137, "loss/hidden": 3.1328125, "loss/jsd": 0.0, "loss/logits": 0.25915856659412384, "step": 4492 }, { "epoch": 0.280875, "grad_norm": 2.84375, "grad_norm_var": 0.02080078125, "learning_rate": 0.0001, "loss": 7.7337, "loss/crossentropy": 2.2085322737693787, "loss/hidden": 3.046875, "loss/jsd": 0.0, "loss/logits": 0.2296338826417923, "step": 4494 }, { "epoch": 0.281, "grad_norm": 4.1875, "grad_norm_var": 0.14198811848958334, "learning_rate": 0.0001, "loss": 7.8112, "loss/crossentropy": 2.470055937767029, "loss/hidden": 3.1015625, "loss/jsd": 0.0, "loss/logits": 0.24330776929855347, "step": 4496 }, { "epoch": 0.281125, "grad_norm": 2.71875, "grad_norm_var": 0.14814351399739584, "learning_rate": 0.0001, "loss": 8.1268, "loss/crossentropy": 2.281893491744995, "loss/hidden": 3.1953125, "loss/jsd": 0.0, "loss/logits": 0.2655613273382187, "step": 4498 }, { "epoch": 0.28125, "grad_norm": 2.8125, "grad_norm_var": 0.141845703125, "learning_rate": 0.0001, "loss": 7.9408, "loss/crossentropy": 2.279741406440735, "loss/hidden": 3.0546875, "loss/jsd": 0.0, "loss/logits": 0.26126067340373993, "step": 4500 }, { "epoch": 0.281375, "grad_norm": 2.515625, "grad_norm_var": 0.14840087890625, "learning_rate": 0.0001, "loss": 7.7975, "loss/crossentropy": 2.2175263166427612, "loss/hidden": 3.0625, "loss/jsd": 0.0, "loss/logits": 0.2331373244524002, "step": 4502 }, { "epoch": 0.2815, "grad_norm": 2.71875, "grad_norm_var": 0.1498046875, "learning_rate": 0.0001, "loss": 7.8458, "loss/crossentropy": 2.3390674591064453, "loss/hidden": 3.0546875, "loss/jsd": 0.0, "loss/logits": 0.22605808824300766, "step": 4504 }, { "epoch": 0.281625, "grad_norm": 3.0625, "grad_norm_var": 0.1538726806640625, "learning_rate": 0.0001, "loss": 7.8952, "loss/crossentropy": 2.3547202348709106, "loss/hidden": 3.140625, "loss/jsd": 0.0, "loss/logits": 0.2439187616109848, "step": 4506 }, { "epoch": 0.28175, "grad_norm": 2.75, "grad_norm_var": 0.15526936848958334, "learning_rate": 0.0001, "loss": 7.6462, "loss/crossentropy": 2.208525776863098, "loss/hidden": 3.046875, "loss/jsd": 0.0, "loss/logits": 0.23356673121452332, "step": 4508 }, { "epoch": 0.281875, "grad_norm": 2.875, "grad_norm_var": 0.15448811848958333, "learning_rate": 0.0001, "loss": 7.7804, "loss/crossentropy": 2.246667981147766, "loss/hidden": 3.1328125, "loss/jsd": 0.0, "loss/logits": 0.22609598189592361, "step": 4510 }, { "epoch": 0.282, "grad_norm": 2.734375, "grad_norm_var": 0.030231730143229166, "learning_rate": 0.0001, "loss": 7.5631, "loss/crossentropy": 2.099067807197571, "loss/hidden": 3.109375, "loss/jsd": 0.0, "loss/logits": 0.2352505549788475, "step": 4512 }, { "epoch": 0.282125, "grad_norm": 2.640625, "grad_norm_var": 0.019856770833333332, "learning_rate": 0.0001, "loss": 7.6429, "loss/crossentropy": 2.238911986351013, "loss/hidden": 3.09375, "loss/jsd": 0.0, "loss/logits": 0.24882740527391434, "step": 4514 }, { "epoch": 0.28225, "grad_norm": 2.890625, "grad_norm_var": 0.019709269205729168, "learning_rate": 0.0001, "loss": 7.9015, "loss/crossentropy": 2.250125765800476, "loss/hidden": 3.1328125, "loss/jsd": 0.0, "loss/logits": 0.24813418090343475, "step": 4516 }, { "epoch": 0.282375, "grad_norm": 2.859375, "grad_norm_var": 0.0155914306640625, "learning_rate": 0.0001, "loss": 7.7124, "loss/crossentropy": 2.4621388912200928, "loss/hidden": 3.03125, "loss/jsd": 0.0, "loss/logits": 0.2333867847919464, "step": 4518 }, { "epoch": 0.2825, "grad_norm": 2.84375, "grad_norm_var": 0.014623006184895834, "learning_rate": 0.0001, "loss": 7.7644, "loss/crossentropy": 2.4793641567230225, "loss/hidden": 3.1328125, "loss/jsd": 0.0, "loss/logits": 0.23019622266292572, "step": 4520 }, { "epoch": 0.282625, "grad_norm": 2.78125, "grad_norm_var": 0.005973307291666666, "learning_rate": 0.0001, "loss": 8.1334, "loss/crossentropy": 2.43166184425354, "loss/hidden": 3.1328125, "loss/jsd": 0.0, "loss/logits": 0.2504468262195587, "step": 4522 }, { "epoch": 0.28275, "grad_norm": 2.703125, "grad_norm_var": 0.005134073893229166, "learning_rate": 0.0001, "loss": 7.9746, "loss/crossentropy": 2.5419434309005737, "loss/hidden": 3.1484375, "loss/jsd": 0.0, "loss/logits": 0.27035558223724365, "step": 4524 }, { "epoch": 0.282875, "grad_norm": 2.71875, "grad_norm_var": 0.005647786458333333, "learning_rate": 0.0001, "loss": 7.825, "loss/crossentropy": 2.4308319091796875, "loss/hidden": 3.109375, "loss/jsd": 0.0, "loss/logits": 0.2315927892923355, "step": 4526 }, { "epoch": 0.283, "grad_norm": 2.84375, "grad_norm_var": 0.0098785400390625, "learning_rate": 0.0001, "loss": 7.9547, "loss/crossentropy": 2.2518080472946167, "loss/hidden": 3.0625, "loss/jsd": 0.0, "loss/logits": 0.23106548935174942, "step": 4528 }, { "epoch": 0.283125, "grad_norm": 2.625, "grad_norm_var": 0.010758463541666667, "learning_rate": 0.0001, "loss": 7.6979, "loss/crossentropy": 2.131285071372986, "loss/hidden": 3.0859375, "loss/jsd": 0.0, "loss/logits": 0.23756247013807297, "step": 4530 }, { "epoch": 0.28325, "grad_norm": 2.890625, "grad_norm_var": 0.011790974934895834, "learning_rate": 0.0001, "loss": 8.0143, "loss/crossentropy": 2.3552494049072266, "loss/hidden": 3.046875, "loss/jsd": 0.0, "loss/logits": 0.2628362476825714, "step": 4532 }, { "epoch": 0.283375, "grad_norm": 2.90625, "grad_norm_var": 0.012821451822916666, "learning_rate": 0.0001, "loss": 7.8703, "loss/crossentropy": 2.254002094268799, "loss/hidden": 3.0703125, "loss/jsd": 0.0, "loss/logits": 0.24302760511636734, "step": 4534 }, { "epoch": 0.2835, "grad_norm": 3.171875, "grad_norm_var": 0.023063151041666667, "learning_rate": 0.0001, "loss": 8.0597, "loss/crossentropy": 2.361047863960266, "loss/hidden": 3.1328125, "loss/jsd": 0.0, "loss/logits": 0.25740326941013336, "step": 4536 }, { "epoch": 0.283625, "grad_norm": 2.65625, "grad_norm_var": 0.024409993489583334, "learning_rate": 0.0001, "loss": 7.9392, "loss/crossentropy": 2.4161765575408936, "loss/hidden": 3.15625, "loss/jsd": 0.0, "loss/logits": 0.25404225289821625, "step": 4538 }, { "epoch": 0.28375, "grad_norm": 2.765625, "grad_norm_var": 0.024039713541666667, "learning_rate": 0.0001, "loss": 7.7441, "loss/crossentropy": 2.0250840187072754, "loss/hidden": 3.0703125, "loss/jsd": 0.0, "loss/logits": 0.2805088981986046, "step": 4540 }, { "epoch": 0.283875, "grad_norm": 2.59375, "grad_norm_var": 0.0249664306640625, "learning_rate": 0.0001, "loss": 7.692, "loss/crossentropy": 2.0919052362442017, "loss/hidden": 3.0390625, "loss/jsd": 0.0, "loss/logits": 0.23983000218868256, "step": 4542 }, { "epoch": 0.284, "grad_norm": 2.78125, "grad_norm_var": 0.0195953369140625, "learning_rate": 0.0001, "loss": 7.5448, "loss/crossentropy": 1.8733795285224915, "loss/hidden": 3.1015625, "loss/jsd": 0.0, "loss/logits": 0.2008344978094101, "step": 4544 }, { "epoch": 0.284125, "grad_norm": 2.734375, "grad_norm_var": 0.01793212890625, "learning_rate": 0.0001, "loss": 7.7733, "loss/crossentropy": 2.488365411758423, "loss/hidden": 3.140625, "loss/jsd": 0.0, "loss/logits": 0.23959456384181976, "step": 4546 }, { "epoch": 0.28425, "grad_norm": 2.84375, "grad_norm_var": 0.0169342041015625, "learning_rate": 0.0001, "loss": 7.7161, "loss/crossentropy": 2.0424224138259888, "loss/hidden": 3.0625, "loss/jsd": 0.0, "loss/logits": 0.2210150584578514, "step": 4548 }, { "epoch": 0.284375, "grad_norm": 2.609375, "grad_norm_var": 0.019294230143229167, "learning_rate": 0.0001, "loss": 7.6586, "loss/crossentropy": 2.267003059387207, "loss/hidden": 3.078125, "loss/jsd": 0.0, "loss/logits": 0.22607499361038208, "step": 4550 }, { "epoch": 0.2845, "grad_norm": 2.8125, "grad_norm_var": 0.007486979166666667, "learning_rate": 0.0001, "loss": 7.7291, "loss/crossentropy": 2.156210422515869, "loss/hidden": 3.046875, "loss/jsd": 0.0, "loss/logits": 0.24287200719118118, "step": 4552 }, { "epoch": 0.284625, "grad_norm": 2.65625, "grad_norm_var": 0.010627237955729167, "learning_rate": 0.0001, "loss": 7.6338, "loss/crossentropy": 2.12448513507843, "loss/hidden": 3.0546875, "loss/jsd": 0.0, "loss/logits": 0.248530775308609, "step": 4554 }, { "epoch": 0.28475, "grad_norm": 2.671875, "grad_norm_var": 0.010789998372395833, "learning_rate": 0.0001, "loss": 7.5894, "loss/crossentropy": 2.1755086183547974, "loss/hidden": 3.0625, "loss/jsd": 0.0, "loss/logits": 0.23503072559833527, "step": 4556 }, { "epoch": 0.284875, "grad_norm": 2.71875, "grad_norm_var": 0.012398274739583333, "learning_rate": 0.0001, "loss": 7.6944, "loss/crossentropy": 2.274489164352417, "loss/hidden": 2.9921875, "loss/jsd": 0.0, "loss/logits": 0.21464046835899353, "step": 4558 }, { "epoch": 0.285, "grad_norm": 2.71875, "grad_norm_var": 0.012653605143229166, "learning_rate": 0.0001, "loss": 7.8047, "loss/crossentropy": 2.117727518081665, "loss/hidden": 3.0078125, "loss/jsd": 0.0, "loss/logits": 0.2323596179485321, "step": 4560 }, { "epoch": 0.285125, "grad_norm": 2.703125, "grad_norm_var": 0.01304931640625, "learning_rate": 0.0001, "loss": 7.8816, "loss/crossentropy": 2.2078381776809692, "loss/hidden": 3.078125, "loss/jsd": 0.0, "loss/logits": 0.24788908660411835, "step": 4562 }, { "epoch": 0.28525, "grad_norm": 2.796875, "grad_norm_var": 0.0162261962890625, "learning_rate": 0.0001, "loss": 7.7233, "loss/crossentropy": 2.242835283279419, "loss/hidden": 3.171875, "loss/jsd": 0.0, "loss/logits": 0.2602119892835617, "step": 4564 }, { "epoch": 0.285375, "grad_norm": 2.90625, "grad_norm_var": 0.015892537434895833, "learning_rate": 0.0001, "loss": 7.9528, "loss/crossentropy": 2.3203724026679993, "loss/hidden": 3.1328125, "loss/jsd": 0.0, "loss/logits": 0.2566442936658859, "step": 4566 }, { "epoch": 0.2855, "grad_norm": 2.671875, "grad_norm_var": 0.016380818684895833, "learning_rate": 0.0001, "loss": 7.9147, "loss/crossentropy": 2.4409515857696533, "loss/hidden": 3.140625, "loss/jsd": 0.0, "loss/logits": 0.26438571512699127, "step": 4568 }, { "epoch": 0.285625, "grad_norm": 2.8125, "grad_norm_var": 0.012962849934895833, "learning_rate": 0.0001, "loss": 7.7573, "loss/crossentropy": 2.1596044301986694, "loss/hidden": 3.03125, "loss/jsd": 0.0, "loss/logits": 0.23533150553703308, "step": 4570 }, { "epoch": 0.28575, "grad_norm": 2.546875, "grad_norm_var": 0.014969889322916667, "learning_rate": 0.0001, "loss": 7.8286, "loss/crossentropy": 2.285353899002075, "loss/hidden": 3.109375, "loss/jsd": 0.0, "loss/logits": 0.24440232664346695, "step": 4572 }, { "epoch": 0.285875, "grad_norm": 2.65625, "grad_norm_var": 0.011979166666666667, "learning_rate": 0.0001, "loss": 7.8914, "loss/crossentropy": 2.3471704721450806, "loss/hidden": 3.0859375, "loss/jsd": 0.0, "loss/logits": 0.23832321166992188, "step": 4574 }, { "epoch": 0.286, "grad_norm": 2.734375, "grad_norm_var": 0.013102213541666666, "learning_rate": 0.0001, "loss": 7.6923, "loss/crossentropy": 2.1897164583206177, "loss/hidden": 3.0625, "loss/jsd": 0.0, "loss/logits": 0.23073186725378036, "step": 4576 }, { "epoch": 0.286125, "grad_norm": 2.796875, "grad_norm_var": 0.013232421875, "learning_rate": 0.0001, "loss": 7.8084, "loss/crossentropy": 2.110626220703125, "loss/hidden": 3.046875, "loss/jsd": 0.0, "loss/logits": 0.23897499591112137, "step": 4578 }, { "epoch": 0.28625, "grad_norm": 2.59375, "grad_norm_var": 0.010249837239583334, "learning_rate": 0.0001, "loss": 7.7414, "loss/crossentropy": 2.116724729537964, "loss/hidden": 3.078125, "loss/jsd": 0.0, "loss/logits": 0.2389025017619133, "step": 4580 }, { "epoch": 0.286375, "grad_norm": 2.734375, "grad_norm_var": 0.013297526041666667, "learning_rate": 0.0001, "loss": 8.103, "loss/crossentropy": 2.592491626739502, "loss/hidden": 3.1328125, "loss/jsd": 0.0, "loss/logits": 0.2534056827425957, "step": 4582 }, { "epoch": 0.2865, "grad_norm": 2.5625, "grad_norm_var": 0.0154296875, "learning_rate": 0.0001, "loss": 7.9184, "loss/crossentropy": 2.255778193473816, "loss/hidden": 3.0625, "loss/jsd": 0.0, "loss/logits": 0.25007857382297516, "step": 4584 }, { "epoch": 0.286625, "grad_norm": 2.765625, "grad_norm_var": 0.0155914306640625, "learning_rate": 0.0001, "loss": 7.934, "loss/crossentropy": 2.2685500383377075, "loss/hidden": 3.078125, "loss/jsd": 0.0, "loss/logits": 0.24690337479114532, "step": 4586 }, { "epoch": 0.28675, "grad_norm": 2.796875, "grad_norm_var": 0.05712483723958333, "learning_rate": 0.0001, "loss": 7.9887, "loss/crossentropy": 2.2190144062042236, "loss/hidden": 3.1328125, "loss/jsd": 0.0, "loss/logits": 0.23835859447717667, "step": 4588 }, { "epoch": 0.286875, "grad_norm": 2.796875, "grad_norm_var": 0.06086324055989583, "learning_rate": 0.0001, "loss": 7.699, "loss/crossentropy": 2.4499796628952026, "loss/hidden": 3.1640625, "loss/jsd": 0.0, "loss/logits": 0.25383134186267853, "step": 4590 }, { "epoch": 0.287, "grad_norm": 2.796875, "grad_norm_var": 0.06018880208333333, "learning_rate": 0.0001, "loss": 7.9184, "loss/crossentropy": 2.351305603981018, "loss/hidden": 3.0546875, "loss/jsd": 0.0, "loss/logits": 0.25358283519744873, "step": 4592 }, { "epoch": 0.287125, "grad_norm": 2.828125, "grad_norm_var": 0.05969645182291667, "learning_rate": 0.0001, "loss": 7.9222, "loss/crossentropy": 2.54839551448822, "loss/hidden": 3.1171875, "loss/jsd": 0.0, "loss/logits": 0.2523096054792404, "step": 4594 }, { "epoch": 0.28725, "grad_norm": 2.96875, "grad_norm_var": 0.05803629557291667, "learning_rate": 0.0001, "loss": 8.0978, "loss/crossentropy": 2.3768727779388428, "loss/hidden": 3.1171875, "loss/jsd": 0.0, "loss/logits": 0.26485833525657654, "step": 4596 }, { "epoch": 0.287375, "grad_norm": 2.78125, "grad_norm_var": 0.0564361572265625, "learning_rate": 0.0001, "loss": 7.9291, "loss/crossentropy": 2.1493752002716064, "loss/hidden": 3.0625, "loss/jsd": 0.0, "loss/logits": 0.24378077685832977, "step": 4598 }, { "epoch": 0.2875, "grad_norm": 2.65625, "grad_norm_var": 0.06765034993489584, "learning_rate": 0.0001, "loss": 7.7357, "loss/crossentropy": 2.1995983123779297, "loss/hidden": 3.125, "loss/jsd": 0.0, "loss/logits": 0.25253790616989136, "step": 4600 }, { "epoch": 0.287625, "grad_norm": 2.859375, "grad_norm_var": 0.06731363932291666, "learning_rate": 0.0001, "loss": 7.766, "loss/crossentropy": 2.3087100982666016, "loss/hidden": 3.1171875, "loss/jsd": 0.0, "loss/logits": 0.24705364555120468, "step": 4602 }, { "epoch": 0.28775, "grad_norm": 2.859375, "grad_norm_var": 0.029488118489583333, "learning_rate": 0.0001, "loss": 7.9583, "loss/crossentropy": 2.489904284477234, "loss/hidden": 3.1796875, "loss/jsd": 0.0, "loss/logits": 0.27738331258296967, "step": 4604 }, { "epoch": 0.287875, "grad_norm": 2.78125, "grad_norm_var": 0.02515869140625, "learning_rate": 0.0001, "loss": 7.8664, "loss/crossentropy": 2.2570077180862427, "loss/hidden": 3.1015625, "loss/jsd": 0.0, "loss/logits": 0.2406313642859459, "step": 4606 }, { "epoch": 0.288, "grad_norm": 2.984375, "grad_norm_var": 0.023607381184895835, "learning_rate": 0.0001, "loss": 8.0543, "loss/crossentropy": 2.2567999362945557, "loss/hidden": 3.234375, "loss/jsd": 0.0, "loss/logits": 0.26747821271419525, "step": 4608 }, { "epoch": 0.288125, "grad_norm": 2.921875, "grad_norm_var": 0.0237701416015625, "learning_rate": 0.0001, "loss": 7.8719, "loss/crossentropy": 2.139029622077942, "loss/hidden": 3.140625, "loss/jsd": 0.0, "loss/logits": 0.24090711027383804, "step": 4610 }, { "epoch": 0.28825, "grad_norm": 2.8125, "grad_norm_var": 0.022630818684895835, "learning_rate": 0.0001, "loss": 7.8638, "loss/crossentropy": 2.338477373123169, "loss/hidden": 3.109375, "loss/jsd": 0.0, "loss/logits": 0.24437221884727478, "step": 4612 }, { "epoch": 0.288375, "grad_norm": 2.78125, "grad_norm_var": 0.022630818684895835, "learning_rate": 0.0001, "loss": 8.0812, "loss/crossentropy": 2.2592166662216187, "loss/hidden": 3.1171875, "loss/jsd": 0.0, "loss/logits": 0.2249501720070839, "step": 4614 }, { "epoch": 0.2885, "grad_norm": 2.765625, "grad_norm_var": 0.0068756103515625, "learning_rate": 0.0001, "loss": 7.9806, "loss/crossentropy": 2.359493136405945, "loss/hidden": 3.140625, "loss/jsd": 0.0, "loss/logits": 0.257377028465271, "step": 4616 }, { "epoch": 0.288625, "grad_norm": 2.78125, "grad_norm_var": 0.007201131184895833, "learning_rate": 0.0001, "loss": 7.9093, "loss/crossentropy": 2.4038420915603638, "loss/hidden": 3.1171875, "loss/jsd": 0.0, "loss/logits": 0.24321341514587402, "step": 4618 }, { "epoch": 0.28875, "grad_norm": 2.71875, "grad_norm_var": 0.007591756184895834, "learning_rate": 0.0001, "loss": 7.9689, "loss/crossentropy": 2.6115297079086304, "loss/hidden": 3.1953125, "loss/jsd": 0.0, "loss/logits": 0.27591899037361145, "step": 4620 }, { "epoch": 0.288875, "grad_norm": 2.578125, "grad_norm_var": 0.011344401041666667, "learning_rate": 0.0001, "loss": 7.6229, "loss/crossentropy": 2.341712713241577, "loss/hidden": 3.0546875, "loss/jsd": 0.0, "loss/logits": 0.2330111712217331, "step": 4622 }, { "epoch": 0.289, "grad_norm": 2.765625, "grad_norm_var": 0.00992431640625, "learning_rate": 0.0001, "loss": 7.9412, "loss/crossentropy": 2.6497541666030884, "loss/hidden": 3.109375, "loss/jsd": 0.0, "loss/logits": 0.23105424642562866, "step": 4624 }, { "epoch": 0.289125, "grad_norm": 2.8125, "grad_norm_var": 0.009554036458333333, "learning_rate": 0.0001, "loss": 7.9315, "loss/crossentropy": 2.081304132938385, "loss/hidden": 3.0, "loss/jsd": 0.0, "loss/logits": 0.23605384677648544, "step": 4626 }, { "epoch": 0.28925, "grad_norm": 2.984375, "grad_norm_var": 0.012498982747395833, "learning_rate": 0.0001, "loss": 8.162, "loss/crossentropy": 2.3057098388671875, "loss/hidden": 3.15625, "loss/jsd": 0.0, "loss/logits": 0.2658655047416687, "step": 4628 }, { "epoch": 0.289375, "grad_norm": 2.84375, "grad_norm_var": 0.013570149739583334, "learning_rate": 0.0001, "loss": 7.6801, "loss/crossentropy": 2.2048500776290894, "loss/hidden": 3.0546875, "loss/jsd": 0.0, "loss/logits": 0.22433912009000778, "step": 4630 }, { "epoch": 0.2895, "grad_norm": 2.734375, "grad_norm_var": 0.01363525390625, "learning_rate": 0.0001, "loss": 7.8816, "loss/crossentropy": 2.3596919775009155, "loss/hidden": 3.09375, "loss/jsd": 0.0, "loss/logits": 0.2508801370859146, "step": 4632 }, { "epoch": 0.289625, "grad_norm": 2.828125, "grad_norm_var": 0.0130279541015625, "learning_rate": 0.0001, "loss": 7.8135, "loss/crossentropy": 2.373443126678467, "loss/hidden": 3.09375, "loss/jsd": 0.0, "loss/logits": 0.2330445945262909, "step": 4634 }, { "epoch": 0.28975, "grad_norm": 2.625, "grad_norm_var": 0.014574178059895833, "learning_rate": 0.0001, "loss": 7.7825, "loss/crossentropy": 2.0845218896865845, "loss/hidden": 3.0078125, "loss/jsd": 0.0, "loss/logits": 0.2310279756784439, "step": 4636 }, { "epoch": 0.289875, "grad_norm": 2.71875, "grad_norm_var": 0.010033162434895833, "learning_rate": 0.0001, "loss": 7.9074, "loss/crossentropy": 2.1493619680404663, "loss/hidden": 3.1171875, "loss/jsd": 0.0, "loss/logits": 0.2433633804321289, "step": 4638 }, { "epoch": 0.29, "grad_norm": 2.8125, "grad_norm_var": 0.010838826497395834, "learning_rate": 0.0001, "loss": 7.9194, "loss/crossentropy": 2.1650909185409546, "loss/hidden": 3.078125, "loss/jsd": 0.0, "loss/logits": 0.2204497754573822, "step": 4640 }, { "epoch": 0.290125, "grad_norm": 2.59375, "grad_norm_var": 0.013993326822916667, "learning_rate": 0.0001, "loss": 7.5943, "loss/crossentropy": 2.114119052886963, "loss/hidden": 3.140625, "loss/jsd": 0.0, "loss/logits": 0.23961012810468674, "step": 4642 }, { "epoch": 0.29025, "grad_norm": 2.90625, "grad_norm_var": 0.011432902018229166, "learning_rate": 0.0001, "loss": 7.9386, "loss/crossentropy": 2.418122887611389, "loss/hidden": 3.125, "loss/jsd": 0.0, "loss/logits": 0.24907371401786804, "step": 4644 }, { "epoch": 0.290375, "grad_norm": 2.75, "grad_norm_var": 0.010282389322916667, "learning_rate": 0.0001, "loss": 7.9826, "loss/crossentropy": 2.4677258729934692, "loss/hidden": 3.0390625, "loss/jsd": 0.0, "loss/logits": 0.24415672570466995, "step": 4646 }, { "epoch": 0.2905, "grad_norm": 2.625, "grad_norm_var": 0.014696248372395833, "learning_rate": 0.0001, "loss": 7.3259, "loss/crossentropy": 2.4106531143188477, "loss/hidden": 3.0859375, "loss/jsd": 0.0, "loss/logits": 0.23293237388134003, "step": 4648 }, { "epoch": 0.290625, "grad_norm": 2.890625, "grad_norm_var": 0.016239420572916666, "learning_rate": 0.0001, "loss": 7.8404, "loss/crossentropy": 2.1289098262786865, "loss/hidden": 3.078125, "loss/jsd": 0.0, "loss/logits": 0.2512098699808121, "step": 4650 }, { "epoch": 0.29075, "grad_norm": 2.75, "grad_norm_var": 0.0156890869140625, "learning_rate": 0.0001, "loss": 8.0697, "loss/crossentropy": 2.4803497791290283, "loss/hidden": 3.15625, "loss/jsd": 0.0, "loss/logits": 0.25660742819309235, "step": 4652 }, { "epoch": 0.290875, "grad_norm": 2.8125, "grad_norm_var": 0.01558837890625, "learning_rate": 0.0001, "loss": 7.7538, "loss/crossentropy": 2.2673219442367554, "loss/hidden": 3.0703125, "loss/jsd": 0.0, "loss/logits": 0.2544805705547333, "step": 4654 }, { "epoch": 0.291, "grad_norm": 2.78125, "grad_norm_var": 0.01441650390625, "learning_rate": 0.0001, "loss": 8.0011, "loss/crossentropy": 2.363333821296692, "loss/hidden": 3.1015625, "loss/jsd": 0.0, "loss/logits": 0.2525113821029663, "step": 4656 }, { "epoch": 0.291125, "grad_norm": 2.796875, "grad_norm_var": 0.013472493489583333, "learning_rate": 0.0001, "loss": 7.838, "loss/crossentropy": 2.447182536125183, "loss/hidden": 3.1171875, "loss/jsd": 0.0, "loss/logits": 0.27067863941192627, "step": 4658 }, { "epoch": 0.29125, "grad_norm": 2.703125, "grad_norm_var": 0.011644490559895833, "learning_rate": 0.0001, "loss": 7.8486, "loss/crossentropy": 2.317343235015869, "loss/hidden": 3.1171875, "loss/jsd": 0.0, "loss/logits": 0.23992155492305756, "step": 4660 }, { "epoch": 0.291375, "grad_norm": 2.59375, "grad_norm_var": 0.012743123372395833, "learning_rate": 0.0001, "loss": 7.7067, "loss/crossentropy": 2.2566442489624023, "loss/hidden": 3.1015625, "loss/jsd": 0.0, "loss/logits": 0.22990144789218903, "step": 4662 }, { "epoch": 0.2915, "grad_norm": 3.0, "grad_norm_var": 0.0131500244140625, "learning_rate": 0.0001, "loss": 7.9825, "loss/crossentropy": 2.31771183013916, "loss/hidden": 3.078125, "loss/jsd": 0.0, "loss/logits": 0.2437821552157402, "step": 4664 }, { "epoch": 0.291625, "grad_norm": 2.6875, "grad_norm_var": 0.012040201822916667, "learning_rate": 0.0001, "loss": 7.8893, "loss/crossentropy": 2.338606595993042, "loss/hidden": 3.078125, "loss/jsd": 0.0, "loss/logits": 0.2427096962928772, "step": 4666 }, { "epoch": 0.29175, "grad_norm": 2.65625, "grad_norm_var": 0.0126373291015625, "learning_rate": 0.0001, "loss": 7.8544, "loss/crossentropy": 2.1768248081207275, "loss/hidden": 3.0234375, "loss/jsd": 0.0, "loss/logits": 0.23780644685029984, "step": 4668 }, { "epoch": 0.291875, "grad_norm": 2.765625, "grad_norm_var": 0.01197509765625, "learning_rate": 0.0001, "loss": 7.8657, "loss/crossentropy": 2.3200763463974, "loss/hidden": 3.078125, "loss/jsd": 0.0, "loss/logits": 0.25116677582263947, "step": 4670 }, { "epoch": 0.292, "grad_norm": 2.546875, "grad_norm_var": 0.01217041015625, "learning_rate": 0.0001, "loss": 7.893, "loss/crossentropy": 2.3021084666252136, "loss/hidden": 3.078125, "loss/jsd": 0.0, "loss/logits": 0.25273073464632034, "step": 4672 }, { "epoch": 0.292125, "grad_norm": 2.84375, "grad_norm_var": 0.013874308268229166, "learning_rate": 0.0001, "loss": 7.5829, "loss/crossentropy": 2.344818592071533, "loss/hidden": 3.09375, "loss/jsd": 0.0, "loss/logits": 0.23609033972024918, "step": 4674 }, { "epoch": 0.29225, "grad_norm": 2.6875, "grad_norm_var": 0.014525349934895833, "learning_rate": 0.0001, "loss": 7.9918, "loss/crossentropy": 2.4708172082901, "loss/hidden": 3.171875, "loss/jsd": 0.0, "loss/logits": 0.2394529953598976, "step": 4676 }, { "epoch": 0.292375, "grad_norm": 2.78125, "grad_norm_var": 0.013623046875, "learning_rate": 0.0001, "loss": 7.8503, "loss/crossentropy": 2.26045298576355, "loss/hidden": 3.0390625, "loss/jsd": 0.0, "loss/logits": 0.2537362724542618, "step": 4678 }, { "epoch": 0.2925, "grad_norm": 2.59375, "grad_norm_var": 0.0128326416015625, "learning_rate": 0.0001, "loss": 7.9592, "loss/crossentropy": 2.4197391271591187, "loss/hidden": 3.0625, "loss/jsd": 0.0, "loss/logits": 0.261072501540184, "step": 4680 }, { "epoch": 0.292625, "grad_norm": 2.609375, "grad_norm_var": 0.0155426025390625, "learning_rate": 0.0001, "loss": 7.669, "loss/crossentropy": 2.1115235090255737, "loss/hidden": 3.015625, "loss/jsd": 0.0, "loss/logits": 0.2209169566631317, "step": 4682 }, { "epoch": 0.29275, "grad_norm": 2.875, "grad_norm_var": 0.016796875, "learning_rate": 0.0001, "loss": 7.8656, "loss/crossentropy": 2.2382802963256836, "loss/hidden": 3.15625, "loss/jsd": 0.0, "loss/logits": 0.26023590564727783, "step": 4684 }, { "epoch": 0.292875, "grad_norm": 2.515625, "grad_norm_var": 0.019554646809895833, "learning_rate": 0.0001, "loss": 7.6088, "loss/crossentropy": 2.3211545944213867, "loss/hidden": 3.0859375, "loss/jsd": 0.0, "loss/logits": 0.2478296160697937, "step": 4686 }, { "epoch": 0.293, "grad_norm": 3.296875, "grad_norm_var": 0.04029032389322917, "learning_rate": 0.0001, "loss": 8.0111, "loss/crossentropy": 2.313927173614502, "loss/hidden": 3.1953125, "loss/jsd": 0.0, "loss/logits": 0.2640543207526207, "step": 4688 }, { "epoch": 0.293125, "grad_norm": 2.78125, "grad_norm_var": 0.03772379557291667, "learning_rate": 0.0001, "loss": 7.7393, "loss/crossentropy": 2.3064017295837402, "loss/hidden": 3.0859375, "loss/jsd": 0.0, "loss/logits": 0.2437521070241928, "step": 4690 }, { "epoch": 0.29325, "grad_norm": 2.609375, "grad_norm_var": 0.0393951416015625, "learning_rate": 0.0001, "loss": 7.8766, "loss/crossentropy": 2.024076998233795, "loss/hidden": 3.03125, "loss/jsd": 0.0, "loss/logits": 0.23882752656936646, "step": 4692 }, { "epoch": 0.293375, "grad_norm": 2.8125, "grad_norm_var": 0.038895670572916666, "learning_rate": 0.0001, "loss": 7.8602, "loss/crossentropy": 2.1963272094726562, "loss/hidden": 3.0625, "loss/jsd": 0.0, "loss/logits": 0.24665944278240204, "step": 4694 }, { "epoch": 0.2935, "grad_norm": 2.765625, "grad_norm_var": 0.03626302083333333, "learning_rate": 0.0001, "loss": 8.1401, "loss/crossentropy": 2.492723226547241, "loss/hidden": 3.1328125, "loss/jsd": 0.0, "loss/logits": 0.26501891016960144, "step": 4696 }, { "epoch": 0.293625, "grad_norm": 2.765625, "grad_norm_var": 0.0286285400390625, "learning_rate": 0.0001, "loss": 7.7462, "loss/crossentropy": 2.2765644788742065, "loss/hidden": 3.0546875, "loss/jsd": 0.0, "loss/logits": 0.2406775951385498, "step": 4698 }, { "epoch": 0.29375, "grad_norm": 2.9375, "grad_norm_var": 0.02955322265625, "learning_rate": 0.0001, "loss": 7.7331, "loss/crossentropy": 2.316620707511902, "loss/hidden": 3.1328125, "loss/jsd": 0.0, "loss/logits": 0.2521413713693619, "step": 4700 }, { "epoch": 0.293875, "grad_norm": 2.75, "grad_norm_var": 0.0240875244140625, "learning_rate": 0.0001, "loss": 7.779, "loss/crossentropy": 2.4202111959457397, "loss/hidden": 3.0390625, "loss/jsd": 0.0, "loss/logits": 0.2336704283952713, "step": 4702 }, { "epoch": 0.294, "grad_norm": 2.890625, "grad_norm_var": 0.008707682291666666, "learning_rate": 0.0001, "loss": 7.8674, "loss/crossentropy": 2.3005402088165283, "loss/hidden": 3.109375, "loss/jsd": 0.0, "loss/logits": 0.23975594341754913, "step": 4704 }, { "epoch": 0.294125, "grad_norm": 2.78125, "grad_norm_var": 0.00836181640625, "learning_rate": 0.0001, "loss": 7.844, "loss/crossentropy": 2.356071710586548, "loss/hidden": 3.125, "loss/jsd": 0.0, "loss/logits": 0.25094401836395264, "step": 4706 }, { "epoch": 0.29425, "grad_norm": 2.609375, "grad_norm_var": 0.0083160400390625, "learning_rate": 0.0001, "loss": 7.6412, "loss/crossentropy": 2.164121150970459, "loss/hidden": 3.1015625, "loss/jsd": 0.0, "loss/logits": 0.22354429215192795, "step": 4708 }, { "epoch": 0.294375, "grad_norm": 3.0, "grad_norm_var": 0.013386027018229166, "learning_rate": 0.0001, "loss": 7.6271, "loss/crossentropy": 2.214874505996704, "loss/hidden": 3.0703125, "loss/jsd": 0.0, "loss/logits": 0.23576121032238007, "step": 4710 }, { "epoch": 0.2945, "grad_norm": 2.90625, "grad_norm_var": 0.01441650390625, "learning_rate": 0.0001, "loss": 7.7613, "loss/crossentropy": 2.269544243812561, "loss/hidden": 3.125, "loss/jsd": 0.0, "loss/logits": 0.24873965978622437, "step": 4712 }, { "epoch": 0.294625, "grad_norm": 2.78125, "grad_norm_var": 0.015071614583333334, "learning_rate": 0.0001, "loss": 7.7393, "loss/crossentropy": 2.1202977895736694, "loss/hidden": 3.0703125, "loss/jsd": 0.0, "loss/logits": 0.2352263107895851, "step": 4714 }, { "epoch": 0.29475, "grad_norm": 2.703125, "grad_norm_var": 0.013166300455729167, "learning_rate": 0.0001, "loss": 7.8775, "loss/crossentropy": 2.094593346118927, "loss/hidden": 3.015625, "loss/jsd": 0.0, "loss/logits": 0.22887995839118958, "step": 4716 }, { "epoch": 0.294875, "grad_norm": 2.640625, "grad_norm_var": 0.0155670166015625, "learning_rate": 0.0001, "loss": 7.9644, "loss/crossentropy": 2.484963059425354, "loss/hidden": 3.125, "loss/jsd": 0.0, "loss/logits": 0.2369099259376526, "step": 4718 }, { "epoch": 0.295, "grad_norm": 2.75, "grad_norm_var": 0.01627197265625, "learning_rate": 0.0001, "loss": 7.7284, "loss/crossentropy": 2.4684488773345947, "loss/hidden": 3.0625, "loss/jsd": 0.0, "loss/logits": 0.252421073615551, "step": 4720 }, { "epoch": 0.295125, "grad_norm": 2.734375, "grad_norm_var": 0.017215983072916666, "learning_rate": 0.0001, "loss": 7.6495, "loss/crossentropy": 2.206870675086975, "loss/hidden": 3.0, "loss/jsd": 0.0, "loss/logits": 0.2408876121044159, "step": 4722 }, { "epoch": 0.29525, "grad_norm": 2.578125, "grad_norm_var": 0.018680826822916666, "learning_rate": 0.0001, "loss": 7.6522, "loss/crossentropy": 2.2269601821899414, "loss/hidden": 3.0859375, "loss/jsd": 0.0, "loss/logits": 0.2338848039507866, "step": 4724 }, { "epoch": 0.295375, "grad_norm": 2.609375, "grad_norm_var": 0.014484659830729166, "learning_rate": 0.0001, "loss": 7.6975, "loss/crossentropy": 2.315587639808655, "loss/hidden": 3.0546875, "loss/jsd": 0.0, "loss/logits": 0.2363230139017105, "step": 4726 }, { "epoch": 0.2955, "grad_norm": 3.15625, "grad_norm_var": 0.0728424072265625, "learning_rate": 0.0001, "loss": 8.1819, "loss/crossentropy": 2.4364620447158813, "loss/hidden": 3.140625, "loss/jsd": 0.0, "loss/logits": 0.2659152150154114, "step": 4728 }, { "epoch": 0.295625, "grad_norm": 2.640625, "grad_norm_var": 0.0735992431640625, "learning_rate": 0.0001, "loss": 7.7693, "loss/crossentropy": 2.211803436279297, "loss/hidden": 3.1484375, "loss/jsd": 0.0, "loss/logits": 0.2437971532344818, "step": 4730 }, { "epoch": 0.29575, "grad_norm": 2.75, "grad_norm_var": 0.07376302083333333, "learning_rate": 0.0001, "loss": 8.0168, "loss/crossentropy": 2.456194519996643, "loss/hidden": 3.171875, "loss/jsd": 0.0, "loss/logits": 0.2696246802806854, "step": 4732 }, { "epoch": 0.295875, "grad_norm": 3.03125, "grad_norm_var": 0.077490234375, "learning_rate": 0.0001, "loss": 8.0845, "loss/crossentropy": 2.474150776863098, "loss/hidden": 3.140625, "loss/jsd": 0.0, "loss/logits": 0.2648543119430542, "step": 4734 }, { "epoch": 0.296, "grad_norm": 2.703125, "grad_norm_var": 0.076806640625, "learning_rate": 0.0001, "loss": 7.8236, "loss/crossentropy": 2.1477906703948975, "loss/hidden": 3.109375, "loss/jsd": 0.0, "loss/logits": 0.21783293038606644, "step": 4736 }, { "epoch": 0.296125, "grad_norm": 2.59375, "grad_norm_var": 0.07971598307291666, "learning_rate": 0.0001, "loss": 7.7776, "loss/crossentropy": 2.420551061630249, "loss/hidden": 3.0859375, "loss/jsd": 0.0, "loss/logits": 0.24374763667583466, "step": 4738 }, { "epoch": 0.29625, "grad_norm": 2.875, "grad_norm_var": 0.0744781494140625, "learning_rate": 0.0001, "loss": 7.8463, "loss/crossentropy": 2.1625437140464783, "loss/hidden": 3.0859375, "loss/jsd": 0.0, "loss/logits": 0.2414453774690628, "step": 4740 }, { "epoch": 0.296375, "grad_norm": 2.625, "grad_norm_var": 0.07405598958333333, "learning_rate": 0.0001, "loss": 7.6572, "loss/crossentropy": 2.3618721961975098, "loss/hidden": 3.078125, "loss/jsd": 0.0, "loss/logits": 0.22302739322185516, "step": 4742 }, { "epoch": 0.2965, "grad_norm": 2.734375, "grad_norm_var": 0.019677734375, "learning_rate": 0.0001, "loss": 7.7039, "loss/crossentropy": 2.276962637901306, "loss/hidden": 3.109375, "loss/jsd": 0.0, "loss/logits": 0.2348506674170494, "step": 4744 }, { "epoch": 0.296625, "grad_norm": 2.984375, "grad_norm_var": 0.0227447509765625, "learning_rate": 0.0001, "loss": 7.7101, "loss/crossentropy": 2.2608531713485718, "loss/hidden": 3.0390625, "loss/jsd": 0.0, "loss/logits": 0.23264042288064957, "step": 4746 }, { "epoch": 0.29675, "grad_norm": 3.0, "grad_norm_var": 0.026106770833333334, "learning_rate": 0.0001, "loss": 7.7291, "loss/crossentropy": 2.259305477142334, "loss/hidden": 3.0546875, "loss/jsd": 0.0, "loss/logits": 0.24493787437677383, "step": 4748 }, { "epoch": 0.296875, "grad_norm": 2.578125, "grad_norm_var": 0.020726521809895832, "learning_rate": 0.0001, "loss": 7.8278, "loss/crossentropy": 2.1236448287963867, "loss/hidden": 3.0703125, "loss/jsd": 0.0, "loss/logits": 0.2166353240609169, "step": 4750 }, { "epoch": 0.297, "grad_norm": 2.734375, "grad_norm_var": 0.020067342122395835, "learning_rate": 0.0001, "loss": 8.0115, "loss/crossentropy": 2.401174783706665, "loss/hidden": 3.109375, "loss/jsd": 0.0, "loss/logits": 0.25013113021850586, "step": 4752 }, { "epoch": 0.297125, "grad_norm": 3.078125, "grad_norm_var": 0.023128255208333334, "learning_rate": 0.0001, "loss": 7.7745, "loss/crossentropy": 2.274779200553894, "loss/hidden": 3.1171875, "loss/jsd": 0.0, "loss/logits": 0.2390403375029564, "step": 4754 }, { "epoch": 0.29725, "grad_norm": 2.8125, "grad_norm_var": 0.022761027018229168, "learning_rate": 0.0001, "loss": 7.8827, "loss/crossentropy": 2.459893226623535, "loss/hidden": 3.046875, "loss/jsd": 0.0, "loss/logits": 0.2380296215415001, "step": 4756 }, { "epoch": 0.297375, "grad_norm": 2.78125, "grad_norm_var": 0.018871053059895834, "learning_rate": 0.0001, "loss": 7.9383, "loss/crossentropy": 2.3783739805221558, "loss/hidden": 3.140625, "loss/jsd": 0.0, "loss/logits": 0.24623656272888184, "step": 4758 }, { "epoch": 0.2975, "grad_norm": 2.640625, "grad_norm_var": 0.027665201822916666, "learning_rate": 0.0001, "loss": 7.8763, "loss/crossentropy": 2.168853282928467, "loss/hidden": 3.1015625, "loss/jsd": 0.0, "loss/logits": 0.24729091674089432, "step": 4760 }, { "epoch": 0.297625, "grad_norm": 2.90625, "grad_norm_var": 0.024437459309895833, "learning_rate": 0.0001, "loss": 7.7638, "loss/crossentropy": 2.251927137374878, "loss/hidden": 3.1015625, "loss/jsd": 0.0, "loss/logits": 0.23072123527526855, "step": 4762 }, { "epoch": 0.29775, "grad_norm": 2.625, "grad_norm_var": 0.024576822916666668, "learning_rate": 0.0001, "loss": 7.9186, "loss/crossentropy": 2.4425971508026123, "loss/hidden": 3.078125, "loss/jsd": 0.0, "loss/logits": 0.26446957886219025, "step": 4764 }, { "epoch": 0.297875, "grad_norm": 2.6875, "grad_norm_var": 0.022248331705729166, "learning_rate": 0.0001, "loss": 8.077, "loss/crossentropy": 2.4315247535705566, "loss/hidden": 3.1328125, "loss/jsd": 0.0, "loss/logits": 0.2588339149951935, "step": 4766 }, { "epoch": 0.298, "grad_norm": 2.59375, "grad_norm_var": 0.025520833333333333, "learning_rate": 0.0001, "loss": 7.7574, "loss/crossentropy": 2.2992852926254272, "loss/hidden": 3.0859375, "loss/jsd": 0.0, "loss/logits": 0.23762428015470505, "step": 4768 }, { "epoch": 0.298125, "grad_norm": 2.625, "grad_norm_var": 0.020849609375, "learning_rate": 0.0001, "loss": 7.8434, "loss/crossentropy": 2.043939173221588, "loss/hidden": 3.0078125, "loss/jsd": 0.0, "loss/logits": 0.21513615548610687, "step": 4770 }, { "epoch": 0.29825, "grad_norm": 2.734375, "grad_norm_var": 0.020954386393229166, "learning_rate": 0.0001, "loss": 7.8304, "loss/crossentropy": 2.2986350059509277, "loss/hidden": 3.0703125, "loss/jsd": 0.0, "loss/logits": 0.2495313584804535, "step": 4772 }, { "epoch": 0.298375, "grad_norm": 2.859375, "grad_norm_var": 0.0213531494140625, "learning_rate": 0.0001, "loss": 7.9589, "loss/crossentropy": 2.339053153991699, "loss/hidden": 3.109375, "loss/jsd": 0.0, "loss/logits": 0.24236908555030823, "step": 4774 }, { "epoch": 0.2985, "grad_norm": 2.78125, "grad_norm_var": 0.013818359375, "learning_rate": 0.0001, "loss": 7.7286, "loss/crossentropy": 2.0957180857658386, "loss/hidden": 3.078125, "loss/jsd": 0.0, "loss/logits": 0.24779074639081955, "step": 4776 }, { "epoch": 0.298625, "grad_norm": 2.78125, "grad_norm_var": 0.012141927083333334, "learning_rate": 0.0001, "loss": 8.233, "loss/crossentropy": 2.9842677116394043, "loss/hidden": 3.1953125, "loss/jsd": 0.0, "loss/logits": 0.2718924880027771, "step": 4778 }, { "epoch": 0.29875, "grad_norm": 2.53125, "grad_norm_var": 0.0136138916015625, "learning_rate": 0.0001, "loss": 7.4854, "loss/crossentropy": 2.190479040145874, "loss/hidden": 3.0078125, "loss/jsd": 0.0, "loss/logits": 0.24591752886772156, "step": 4780 }, { "epoch": 0.298875, "grad_norm": 2.828125, "grad_norm_var": 0.015458170572916667, "learning_rate": 0.0001, "loss": 7.7876, "loss/crossentropy": 2.197520136833191, "loss/hidden": 3.1796875, "loss/jsd": 0.0, "loss/logits": 0.2530493885278702, "step": 4782 }, { "epoch": 0.299, "grad_norm": 2.6875, "grad_norm_var": 0.018033854166666665, "learning_rate": 0.0001, "loss": 7.8095, "loss/crossentropy": 2.218672752380371, "loss/hidden": 3.15625, "loss/jsd": 0.0, "loss/logits": 0.27455490082502365, "step": 4784 }, { "epoch": 0.299125, "grad_norm": 2.671875, "grad_norm_var": 0.015751139322916666, "learning_rate": 0.0001, "loss": 7.7287, "loss/crossentropy": 2.2696913480758667, "loss/hidden": 3.0859375, "loss/jsd": 0.0, "loss/logits": 0.2461843192577362, "step": 4786 }, { "epoch": 0.29925, "grad_norm": 2.875, "grad_norm_var": 0.01666259765625, "learning_rate": 0.0001, "loss": 7.6669, "loss/crossentropy": 2.3570048809051514, "loss/hidden": 3.0546875, "loss/jsd": 0.0, "loss/logits": 0.22745678573846817, "step": 4788 }, { "epoch": 0.299375, "grad_norm": 2.734375, "grad_norm_var": 0.017145792643229168, "learning_rate": 0.0001, "loss": 7.5036, "loss/crossentropy": 2.24556565284729, "loss/hidden": 3.0625, "loss/jsd": 0.0, "loss/logits": 0.23811353743076324, "step": 4790 }, { "epoch": 0.2995, "grad_norm": 2.953125, "grad_norm_var": 0.02291259765625, "learning_rate": 0.0001, "loss": 8.0203, "loss/crossentropy": 2.309758424758911, "loss/hidden": 3.1484375, "loss/jsd": 0.0, "loss/logits": 0.25146184861660004, "step": 4792 }, { "epoch": 0.299625, "grad_norm": 2.6875, "grad_norm_var": 0.025358072916666665, "learning_rate": 0.0001, "loss": 7.6282, "loss/crossentropy": 2.077813982963562, "loss/hidden": 3.1015625, "loss/jsd": 0.0, "loss/logits": 0.24285054206848145, "step": 4794 }, { "epoch": 0.29975, "grad_norm": 2.515625, "grad_norm_var": 0.0229156494140625, "learning_rate": 0.0001, "loss": 7.9302, "loss/crossentropy": 2.1969715356826782, "loss/hidden": 3.078125, "loss/jsd": 0.0, "loss/logits": 0.2558520808815956, "step": 4796 }, { "epoch": 0.299875, "grad_norm": 2.765625, "grad_norm_var": 0.022880045572916667, "learning_rate": 0.0001, "loss": 7.8368, "loss/crossentropy": 2.3381584882736206, "loss/hidden": 3.1015625, "loss/jsd": 0.0, "loss/logits": 0.24594520032405853, "step": 4798 }, { "epoch": 0.3, "grad_norm": 2.78125, "grad_norm_var": 0.025516764322916666, "learning_rate": 0.0001, "loss": 7.966, "loss/crossentropy": 2.3393973112106323, "loss/hidden": 3.09375, "loss/jsd": 0.0, "loss/logits": 0.2429567277431488, "step": 4800 }, { "epoch": 0.300125, "grad_norm": 3.15625, "grad_norm_var": 0.033426920572916664, "learning_rate": 0.0001, "loss": 7.7758, "loss/crossentropy": 2.355650305747986, "loss/hidden": 3.125, "loss/jsd": 0.0, "loss/logits": 0.2515396624803543, "step": 4802 }, { "epoch": 0.30025, "grad_norm": 2.53125, "grad_norm_var": 0.038895670572916666, "learning_rate": 0.0001, "loss": 7.5675, "loss/crossentropy": 2.295384407043457, "loss/hidden": 3.0546875, "loss/jsd": 0.0, "loss/logits": 0.24032189697027206, "step": 4804 }, { "epoch": 0.300375, "grad_norm": 2.875, "grad_norm_var": 0.03721415201822917, "learning_rate": 0.0001, "loss": 7.859, "loss/crossentropy": 2.2477974891662598, "loss/hidden": 3.0625, "loss/jsd": 0.0, "loss/logits": 0.2460411638021469, "step": 4806 }, { "epoch": 0.3005, "grad_norm": 3.015625, "grad_norm_var": 0.03443603515625, "learning_rate": 0.0001, "loss": 7.9543, "loss/crossentropy": 2.465542197227478, "loss/hidden": 3.15625, "loss/jsd": 0.0, "loss/logits": 0.27130480110645294, "step": 4808 }, { "epoch": 0.300625, "grad_norm": 2.890625, "grad_norm_var": 0.03127848307291667, "learning_rate": 0.0001, "loss": 7.9928, "loss/crossentropy": 2.4237444400787354, "loss/hidden": 3.140625, "loss/jsd": 0.0, "loss/logits": 0.27084454894065857, "step": 4810 }, { "epoch": 0.30075, "grad_norm": 3.109375, "grad_norm_var": 0.030256144205729165, "learning_rate": 0.0001, "loss": 7.8769, "loss/crossentropy": 2.2092403173446655, "loss/hidden": 3.109375, "loss/jsd": 0.0, "loss/logits": 0.25872690975666046, "step": 4812 }, { "epoch": 0.300875, "grad_norm": 2.796875, "grad_norm_var": 0.030045572916666666, "learning_rate": 0.0001, "loss": 7.6101, "loss/crossentropy": 2.1110405921936035, "loss/hidden": 3.140625, "loss/jsd": 0.0, "loss/logits": 0.25246672332286835, "step": 4814 }, { "epoch": 0.301, "grad_norm": 2.828125, "grad_norm_var": 0.027897135416666666, "learning_rate": 0.0001, "loss": 7.7307, "loss/crossentropy": 2.1861435770988464, "loss/hidden": 3.0859375, "loss/jsd": 0.0, "loss/logits": 0.24098865687847137, "step": 4816 }, { "epoch": 0.301125, "grad_norm": 3.0625, "grad_norm_var": 0.030671183268229166, "learning_rate": 0.0001, "loss": 8.0081, "loss/crossentropy": 2.353633165359497, "loss/hidden": 3.1328125, "loss/jsd": 0.0, "loss/logits": 0.25807127356529236, "step": 4818 }, { "epoch": 0.30125, "grad_norm": 2.921875, "grad_norm_var": 0.0243072509765625, "learning_rate": 0.0001, "loss": 7.9272, "loss/crossentropy": 2.309553384780884, "loss/hidden": 3.1171875, "loss/jsd": 0.0, "loss/logits": 0.24087034165859222, "step": 4820 }, { "epoch": 0.301375, "grad_norm": 2.6875, "grad_norm_var": 0.0261383056640625, "learning_rate": 0.0001, "loss": 7.7121, "loss/crossentropy": 2.2967405319213867, "loss/hidden": 3.078125, "loss/jsd": 0.0, "loss/logits": 0.24296566098928452, "step": 4822 }, { "epoch": 0.3015, "grad_norm": 3.0, "grad_norm_var": 0.025419108072916665, "learning_rate": 0.0001, "loss": 7.7998, "loss/crossentropy": 2.126443088054657, "loss/hidden": 3.09375, "loss/jsd": 0.0, "loss/logits": 0.24496936798095703, "step": 4824 }, { "epoch": 0.301625, "grad_norm": 2.71875, "grad_norm_var": 0.027567545572916668, "learning_rate": 0.0001, "loss": 7.6624, "loss/crossentropy": 2.077734351158142, "loss/hidden": 3.078125, "loss/jsd": 0.0, "loss/logits": 0.2187972590327263, "step": 4826 }, { "epoch": 0.30175, "grad_norm": 2.671875, "grad_norm_var": 0.024689737955729166, "learning_rate": 0.0001, "loss": 7.8426, "loss/crossentropy": 2.2834479808807373, "loss/hidden": 3.0625, "loss/jsd": 0.0, "loss/logits": 0.2420749068260193, "step": 4828 }, { "epoch": 0.301875, "grad_norm": 2.625, "grad_norm_var": 0.02603759765625, "learning_rate": 0.0001, "loss": 7.8154, "loss/crossentropy": 2.260331392288208, "loss/hidden": 3.1484375, "loss/jsd": 0.0, "loss/logits": 0.2873457819223404, "step": 4830 }, { "epoch": 0.302, "grad_norm": 2.703125, "grad_norm_var": 0.0253082275390625, "learning_rate": 0.0001, "loss": 7.7021, "loss/crossentropy": 2.262237071990967, "loss/hidden": 3.015625, "loss/jsd": 0.0, "loss/logits": 0.23278243839740753, "step": 4832 }, { "epoch": 0.302125, "grad_norm": 2.484375, "grad_norm_var": 0.0204254150390625, "learning_rate": 0.0001, "loss": 7.6214, "loss/crossentropy": 2.257599115371704, "loss/hidden": 3.0234375, "loss/jsd": 0.0, "loss/logits": 0.23131565004587173, "step": 4834 }, { "epoch": 0.30225, "grad_norm": 2.78125, "grad_norm_var": 0.01783447265625, "learning_rate": 0.0001, "loss": 8.0001, "loss/crossentropy": 2.1568833589553833, "loss/hidden": 3.09375, "loss/jsd": 0.0, "loss/logits": 0.2344806119799614, "step": 4836 }, { "epoch": 0.302375, "grad_norm": 2.859375, "grad_norm_var": 0.019514973958333334, "learning_rate": 0.0001, "loss": 8.096, "loss/crossentropy": 2.5879868268966675, "loss/hidden": 3.1015625, "loss/jsd": 0.0, "loss/logits": 0.2531944811344147, "step": 4838 }, { "epoch": 0.3025, "grad_norm": 2.5625, "grad_norm_var": 0.016047159830729168, "learning_rate": 0.0001, "loss": 7.8222, "loss/crossentropy": 2.415086269378662, "loss/hidden": 3.1015625, "loss/jsd": 0.0, "loss/logits": 0.2404794991016388, "step": 4840 }, { "epoch": 0.302625, "grad_norm": 2.875, "grad_norm_var": 0.013570149739583334, "learning_rate": 0.0001, "loss": 7.8946, "loss/crossentropy": 2.472349166870117, "loss/hidden": 3.109375, "loss/jsd": 0.0, "loss/logits": 0.27219946682453156, "step": 4842 }, { "epoch": 0.30275, "grad_norm": 3.375, "grad_norm_var": 0.040913899739583336, "learning_rate": 0.0001, "loss": 7.9051, "loss/crossentropy": 2.432919979095459, "loss/hidden": 3.0625, "loss/jsd": 0.0, "loss/logits": 0.23057591915130615, "step": 4844 }, { "epoch": 0.302875, "grad_norm": 2.71875, "grad_norm_var": 0.040160115559895834, "learning_rate": 0.0001, "loss": 8.0401, "loss/crossentropy": 2.359107494354248, "loss/hidden": 3.1171875, "loss/jsd": 0.0, "loss/logits": 0.250319704413414, "step": 4846 }, { "epoch": 0.303, "grad_norm": 2.953125, "grad_norm_var": 0.0420318603515625, "learning_rate": 0.0001, "loss": 7.7646, "loss/crossentropy": 2.4273535013198853, "loss/hidden": 3.0703125, "loss/jsd": 0.0, "loss/logits": 0.2483212798833847, "step": 4848 }, { "epoch": 0.303125, "grad_norm": 2.75, "grad_norm_var": 0.03319905598958333, "learning_rate": 0.0001, "loss": 7.7906, "loss/crossentropy": 2.0931962728500366, "loss/hidden": 3.109375, "loss/jsd": 0.0, "loss/logits": 0.22388184815645218, "step": 4850 }, { "epoch": 0.30325, "grad_norm": 2.6875, "grad_norm_var": 0.03367411295572917, "learning_rate": 0.0001, "loss": 7.9, "loss/crossentropy": 2.418039083480835, "loss/hidden": 3.1484375, "loss/jsd": 0.0, "loss/logits": 0.26220010221004486, "step": 4852 }, { "epoch": 0.303375, "grad_norm": 2.703125, "grad_norm_var": 0.03406575520833333, "learning_rate": 0.0001, "loss": 7.6481, "loss/crossentropy": 2.1874221563339233, "loss/hidden": 3.1015625, "loss/jsd": 0.0, "loss/logits": 0.2431078925728798, "step": 4854 }, { "epoch": 0.3035, "grad_norm": 2.671875, "grad_norm_var": 0.033610026041666664, "learning_rate": 0.0001, "loss": 7.5331, "loss/crossentropy": 2.4446771144866943, "loss/hidden": 3.078125, "loss/jsd": 0.0, "loss/logits": 0.2340606153011322, "step": 4856 }, { "epoch": 0.303625, "grad_norm": 2.875, "grad_norm_var": 0.030615234375, "learning_rate": 0.0001, "loss": 7.9552, "loss/crossentropy": 2.355180263519287, "loss/hidden": 3.0703125, "loss/jsd": 0.0, "loss/logits": 0.25452350825071335, "step": 4858 }, { "epoch": 0.30375, "grad_norm": 2.828125, "grad_norm_var": 0.008968098958333334, "learning_rate": 0.0001, "loss": 7.7248, "loss/crossentropy": 2.2880266904830933, "loss/hidden": 3.0859375, "loss/jsd": 0.0, "loss/logits": 0.23547657579183578, "step": 4860 }, { "epoch": 0.303875, "grad_norm": 2.71875, "grad_norm_var": 0.008817545572916667, "learning_rate": 0.0001, "loss": 7.9057, "loss/crossentropy": 2.0984508991241455, "loss/hidden": 3.0390625, "loss/jsd": 0.0, "loss/logits": 0.23664943128824234, "step": 4862 }, { "epoch": 0.304, "grad_norm": 2.8125, "grad_norm_var": 0.0067860921223958336, "learning_rate": 0.0001, "loss": 7.8171, "loss/crossentropy": 2.265193462371826, "loss/hidden": 3.2109375, "loss/jsd": 0.0, "loss/logits": 0.2543717473745346, "step": 4864 }, { "epoch": 0.304125, "grad_norm": 3.125, "grad_norm_var": 0.015306599934895833, "learning_rate": 0.0001, "loss": 7.9105, "loss/crossentropy": 2.4163901805877686, "loss/hidden": 3.140625, "loss/jsd": 0.0, "loss/logits": 0.30484558641910553, "step": 4866 }, { "epoch": 0.30425, "grad_norm": 2.734375, "grad_norm_var": 0.015770467122395833, "learning_rate": 0.0001, "loss": 7.6951, "loss/crossentropy": 2.2933754920959473, "loss/hidden": 3.09375, "loss/jsd": 0.0, "loss/logits": 0.2480909451842308, "step": 4868 }, { "epoch": 0.304375, "grad_norm": 2.65625, "grad_norm_var": 0.019596354166666666, "learning_rate": 0.0001, "loss": 7.8028, "loss/crossentropy": 2.1020551919937134, "loss/hidden": 3.125, "loss/jsd": 0.0, "loss/logits": 0.24488769471645355, "step": 4870 }, { "epoch": 0.3045, "grad_norm": 2.96875, "grad_norm_var": 0.017219034830729167, "learning_rate": 0.0001, "loss": 7.7937, "loss/crossentropy": 2.3514881134033203, "loss/hidden": 3.1015625, "loss/jsd": 0.0, "loss/logits": 0.2567467764019966, "step": 4872 }, { "epoch": 0.304625, "grad_norm": 2.875, "grad_norm_var": 0.019189453125, "learning_rate": 0.0001, "loss": 7.5149, "loss/crossentropy": 2.3018182516098022, "loss/hidden": 3.0703125, "loss/jsd": 0.0, "loss/logits": 0.2473335713148117, "step": 4874 }, { "epoch": 0.30475, "grad_norm": 2.640625, "grad_norm_var": 0.02027587890625, "learning_rate": 0.0001, "loss": 7.666, "loss/crossentropy": 2.4247965812683105, "loss/hidden": 3.125, "loss/jsd": 0.0, "loss/logits": 0.2394721657037735, "step": 4876 }, { "epoch": 0.304875, "grad_norm": 2.671875, "grad_norm_var": 0.024462890625, "learning_rate": 0.0001, "loss": 7.6745, "loss/crossentropy": 2.0793182849884033, "loss/hidden": 3.0625, "loss/jsd": 0.0, "loss/logits": 0.23385987430810928, "step": 4878 }, { "epoch": 0.305, "grad_norm": 2.65625, "grad_norm_var": 0.025419108072916665, "learning_rate": 0.0001, "loss": 7.8525, "loss/crossentropy": 2.2909252643585205, "loss/hidden": 3.140625, "loss/jsd": 0.0, "loss/logits": 0.23921141028404236, "step": 4880 }, { "epoch": 0.305125, "grad_norm": 2.765625, "grad_norm_var": 0.016825358072916668, "learning_rate": 0.0001, "loss": 7.765, "loss/crossentropy": 2.396257758140564, "loss/hidden": 3.1796875, "loss/jsd": 0.0, "loss/logits": 0.25963690876960754, "step": 4882 }, { "epoch": 0.30525, "grad_norm": 2.484375, "grad_norm_var": 0.020654296875, "learning_rate": 0.0001, "loss": 7.6547, "loss/crossentropy": 2.212783694267273, "loss/hidden": 3.0, "loss/jsd": 0.0, "loss/logits": 0.22965668141841888, "step": 4884 }, { "epoch": 0.305375, "grad_norm": 2.453125, "grad_norm_var": 0.018961588541666668, "learning_rate": 0.0001, "loss": 7.6935, "loss/crossentropy": 2.0556681156158447, "loss/hidden": 3.0703125, "loss/jsd": 0.0, "loss/logits": 0.23647285997867584, "step": 4886 }, { "epoch": 0.3055, "grad_norm": 2.921875, "grad_norm_var": 0.0237945556640625, "learning_rate": 0.0001, "loss": 8.0814, "loss/crossentropy": 2.607710838317871, "loss/hidden": 3.109375, "loss/jsd": 0.0, "loss/logits": 0.24981997162103653, "step": 4888 }, { "epoch": 0.305625, "grad_norm": 2.625, "grad_norm_var": 0.021996053059895833, "learning_rate": 0.0001, "loss": 7.5949, "loss/crossentropy": 2.1979677081108093, "loss/hidden": 3.0234375, "loss/jsd": 0.0, "loss/logits": 0.22433170676231384, "step": 4890 }, { "epoch": 0.30575, "grad_norm": 2.765625, "grad_norm_var": 0.0276519775390625, "learning_rate": 0.0001, "loss": 7.8266, "loss/crossentropy": 2.137311816215515, "loss/hidden": 3.015625, "loss/jsd": 0.0, "loss/logits": 0.23312117159366608, "step": 4892 }, { "epoch": 0.305875, "grad_norm": 2.734375, "grad_norm_var": 0.025712076822916666, "learning_rate": 0.0001, "loss": 7.6331, "loss/crossentropy": 2.174046754837036, "loss/hidden": 3.09375, "loss/jsd": 0.0, "loss/logits": 0.22642475366592407, "step": 4894 }, { "epoch": 0.306, "grad_norm": 2.53125, "grad_norm_var": 0.028343709309895833, "learning_rate": 0.0001, "loss": 7.7241, "loss/crossentropy": 2.456217646598816, "loss/hidden": 3.140625, "loss/jsd": 0.0, "loss/logits": 0.2483394891023636, "step": 4896 }, { "epoch": 0.306125, "grad_norm": 3.109375, "grad_norm_var": 0.036844889322916664, "learning_rate": 0.0001, "loss": 7.9141, "loss/crossentropy": 2.6106754541397095, "loss/hidden": 3.109375, "loss/jsd": 0.0, "loss/logits": 0.24365855008363724, "step": 4898 }, { "epoch": 0.30625, "grad_norm": 2.609375, "grad_norm_var": 0.03371988932291667, "learning_rate": 0.0001, "loss": 7.7919, "loss/crossentropy": 2.322625160217285, "loss/hidden": 3.03125, "loss/jsd": 0.0, "loss/logits": 0.24193479120731354, "step": 4900 }, { "epoch": 0.306375, "grad_norm": 2.96875, "grad_norm_var": 0.027945963541666667, "learning_rate": 0.0001, "loss": 7.8711, "loss/crossentropy": 2.1693862676620483, "loss/hidden": 3.15625, "loss/jsd": 0.0, "loss/logits": 0.25908514857292175, "step": 4902 }, { "epoch": 0.3065, "grad_norm": 2.578125, "grad_norm_var": 0.0261871337890625, "learning_rate": 0.0001, "loss": 7.6238, "loss/crossentropy": 2.316340684890747, "loss/hidden": 3.0078125, "loss/jsd": 0.0, "loss/logits": 0.22458643466234207, "step": 4904 }, { "epoch": 0.306625, "grad_norm": 2.71875, "grad_norm_var": 0.024787394205729167, "learning_rate": 0.0001, "loss": 7.9585, "loss/crossentropy": 2.375882387161255, "loss/hidden": 3.1328125, "loss/jsd": 0.0, "loss/logits": 0.25515682995319366, "step": 4906 }, { "epoch": 0.30675, "grad_norm": 2.65625, "grad_norm_var": 0.022215779622395834, "learning_rate": 0.0001, "loss": 7.5929, "loss/crossentropy": 2.0988820791244507, "loss/hidden": 3.0078125, "loss/jsd": 0.0, "loss/logits": 0.20962446182966232, "step": 4908 }, { "epoch": 0.306875, "grad_norm": 2.703125, "grad_norm_var": 0.022687784830729165, "learning_rate": 0.0001, "loss": 7.8819, "loss/crossentropy": 2.2994900941848755, "loss/hidden": 3.1015625, "loss/jsd": 0.0, "loss/logits": 0.23750250041484833, "step": 4910 }, { "epoch": 0.307, "grad_norm": 3.015625, "grad_norm_var": 0.022932942708333334, "learning_rate": 0.0001, "loss": 7.9912, "loss/crossentropy": 2.3102420568466187, "loss/hidden": 3.046875, "loss/jsd": 0.0, "loss/logits": 0.24828917533159256, "step": 4912 }, { "epoch": 0.307125, "grad_norm": 2.8125, "grad_norm_var": 0.015526326497395833, "learning_rate": 0.0001, "loss": 7.8615, "loss/crossentropy": 2.5603725910186768, "loss/hidden": 3.09375, "loss/jsd": 0.0, "loss/logits": 0.26187654584646225, "step": 4914 }, { "epoch": 0.30725, "grad_norm": 2.59375, "grad_norm_var": 0.015218098958333334, "learning_rate": 0.0001, "loss": 7.5902, "loss/crossentropy": 2.3989635705947876, "loss/hidden": 3.1328125, "loss/jsd": 0.0, "loss/logits": 0.25692370533943176, "step": 4916 }, { "epoch": 0.307375, "grad_norm": 2.546875, "grad_norm_var": 0.014085896809895833, "learning_rate": 0.0001, "loss": 7.4784, "loss/crossentropy": 2.176905870437622, "loss/hidden": 3.109375, "loss/jsd": 0.0, "loss/logits": 0.23352482914924622, "step": 4918 }, { "epoch": 0.3075, "grad_norm": 2.890625, "grad_norm_var": 0.014762369791666667, "learning_rate": 0.0001, "loss": 7.787, "loss/crossentropy": 2.321637988090515, "loss/hidden": 3.0703125, "loss/jsd": 0.0, "loss/logits": 0.23889657855033875, "step": 4920 }, { "epoch": 0.307625, "grad_norm": 2.640625, "grad_norm_var": 0.020621744791666667, "learning_rate": 0.0001, "loss": 7.8236, "loss/crossentropy": 2.117127299308777, "loss/hidden": 3.0703125, "loss/jsd": 0.0, "loss/logits": 0.23712149262428284, "step": 4922 }, { "epoch": 0.30775, "grad_norm": 2.84375, "grad_norm_var": 0.022126261393229166, "learning_rate": 0.0001, "loss": 7.775, "loss/crossentropy": 2.205019950866699, "loss/hidden": 3.0390625, "loss/jsd": 0.0, "loss/logits": 0.2316625490784645, "step": 4924 }, { "epoch": 0.307875, "grad_norm": 2.71875, "grad_norm_var": 0.02242431640625, "learning_rate": 0.0001, "loss": 7.7984, "loss/crossentropy": 2.2750273942947388, "loss/hidden": 3.1328125, "loss/jsd": 0.0, "loss/logits": 0.2512553781270981, "step": 4926 }, { "epoch": 0.308, "grad_norm": 2.859375, "grad_norm_var": 0.021581013997395832, "learning_rate": 0.0001, "loss": 7.9773, "loss/crossentropy": 2.3810060024261475, "loss/hidden": 3.0390625, "loss/jsd": 0.0, "loss/logits": 0.23797668516635895, "step": 4928 }, { "epoch": 0.308125, "grad_norm": 2.515625, "grad_norm_var": 0.024560546875, "learning_rate": 0.0001, "loss": 7.5906, "loss/crossentropy": 2.1816707849502563, "loss/hidden": 2.90625, "loss/jsd": 0.0, "loss/logits": 0.2099239081144333, "step": 4930 }, { "epoch": 0.30825, "grad_norm": 2.703125, "grad_norm_var": 0.02373046875, "learning_rate": 0.0001, "loss": 7.5918, "loss/crossentropy": 2.2886866331100464, "loss/hidden": 3.0859375, "loss/jsd": 0.0, "loss/logits": 0.24306583404541016, "step": 4932 }, { "epoch": 0.308375, "grad_norm": 2.6875, "grad_norm_var": 0.023291015625, "learning_rate": 0.0001, "loss": 7.5391, "loss/crossentropy": 2.120474934577942, "loss/hidden": 3.0703125, "loss/jsd": 0.0, "loss/logits": 0.22467254102230072, "step": 4934 }, { "epoch": 0.3085, "grad_norm": 2.6875, "grad_norm_var": 0.022386678059895835, "learning_rate": 0.0001, "loss": 7.7946, "loss/crossentropy": 2.317689895629883, "loss/hidden": 2.9921875, "loss/jsd": 0.0, "loss/logits": 0.23832610249519348, "step": 4936 }, { "epoch": 0.308625, "grad_norm": 2.8125, "grad_norm_var": 0.0185943603515625, "learning_rate": 0.0001, "loss": 7.6352, "loss/crossentropy": 2.2538540363311768, "loss/hidden": 3.0546875, "loss/jsd": 0.0, "loss/logits": 0.23533914238214493, "step": 4938 }, { "epoch": 0.30875, "grad_norm": 2.609375, "grad_norm_var": 0.0166168212890625, "learning_rate": 0.0001, "loss": 7.6378, "loss/crossentropy": 2.2933260202407837, "loss/hidden": 3.1015625, "loss/jsd": 0.0, "loss/logits": 0.25203002244234085, "step": 4940 }, { "epoch": 0.308875, "grad_norm": 3.015625, "grad_norm_var": 0.025145467122395834, "learning_rate": 0.0001, "loss": 7.9084, "loss/crossentropy": 2.4748085737228394, "loss/hidden": 3.0859375, "loss/jsd": 0.0, "loss/logits": 0.2587004452943802, "step": 4942 }, { "epoch": 0.309, "grad_norm": 2.703125, "grad_norm_var": 0.018745930989583333, "learning_rate": 0.0001, "loss": 7.5399, "loss/crossentropy": 2.0790669322013855, "loss/hidden": 3.078125, "loss/jsd": 0.0, "loss/logits": 0.2288169413805008, "step": 4944 }, { "epoch": 0.309125, "grad_norm": 2.8125, "grad_norm_var": 0.018387858072916666, "learning_rate": 0.0001, "loss": 7.7586, "loss/crossentropy": 2.2702629566192627, "loss/hidden": 3.046875, "loss/jsd": 0.0, "loss/logits": 0.2488533854484558, "step": 4946 }, { "epoch": 0.30925, "grad_norm": 2.90625, "grad_norm_var": 0.025028483072916666, "learning_rate": 0.0001, "loss": 7.9645, "loss/crossentropy": 2.5230122804641724, "loss/hidden": 3.1328125, "loss/jsd": 0.0, "loss/logits": 0.26385150849819183, "step": 4948 }, { "epoch": 0.309375, "grad_norm": 2.8125, "grad_norm_var": 0.022777303059895834, "learning_rate": 0.0001, "loss": 7.7661, "loss/crossentropy": 2.141109585762024, "loss/hidden": 3.03125, "loss/jsd": 0.0, "loss/logits": 0.24200350791215897, "step": 4950 }, { "epoch": 0.3095, "grad_norm": 3.078125, "grad_norm_var": 0.027762858072916667, "learning_rate": 0.0001, "loss": 7.7392, "loss/crossentropy": 2.2927032709121704, "loss/hidden": 3.1015625, "loss/jsd": 0.0, "loss/logits": 0.2502191886305809, "step": 4952 }, { "epoch": 0.309625, "grad_norm": 2.75, "grad_norm_var": 0.022395833333333334, "learning_rate": 0.0001, "loss": 7.7376, "loss/crossentropy": 2.288534641265869, "loss/hidden": 3.0390625, "loss/jsd": 0.0, "loss/logits": 0.24027501046657562, "step": 4954 }, { "epoch": 0.30975, "grad_norm": 2.546875, "grad_norm_var": 0.02271728515625, "learning_rate": 0.0001, "loss": 7.7584, "loss/crossentropy": 2.1701200008392334, "loss/hidden": 3.0, "loss/jsd": 0.0, "loss/logits": 0.23408719897270203, "step": 4956 }, { "epoch": 0.309875, "grad_norm": 2.703125, "grad_norm_var": 0.019331868489583334, "learning_rate": 0.0001, "loss": 7.8465, "loss/crossentropy": 2.322484612464905, "loss/hidden": 3.015625, "loss/jsd": 0.0, "loss/logits": 0.2424558699131012, "step": 4958 }, { "epoch": 0.31, "grad_norm": 2.921875, "grad_norm_var": 0.0214019775390625, "learning_rate": 0.0001, "loss": 7.8282, "loss/crossentropy": 2.407430052757263, "loss/hidden": 3.1015625, "loss/jsd": 0.0, "loss/logits": 0.25119274854660034, "step": 4960 }, { "epoch": 0.310125, "grad_norm": 2.9375, "grad_norm_var": 0.0226226806640625, "learning_rate": 0.0001, "loss": 7.6702, "loss/crossentropy": 2.178459644317627, "loss/hidden": 3.0859375, "loss/jsd": 0.0, "loss/logits": 0.26392340660095215, "step": 4962 }, { "epoch": 0.31025, "grad_norm": 2.65625, "grad_norm_var": 0.021354166666666667, "learning_rate": 0.0001, "loss": 7.8627, "loss/crossentropy": 2.4748719930648804, "loss/hidden": 3.125, "loss/jsd": 0.0, "loss/logits": 0.24051623791456223, "step": 4964 }, { "epoch": 0.310375, "grad_norm": 2.84375, "grad_norm_var": 0.021272786458333335, "learning_rate": 0.0001, "loss": 7.7981, "loss/crossentropy": 2.2886499762535095, "loss/hidden": 3.046875, "loss/jsd": 0.0, "loss/logits": 0.23080138117074966, "step": 4966 }, { "epoch": 0.3105, "grad_norm": 2.78125, "grad_norm_var": 0.015363566080729167, "learning_rate": 0.0001, "loss": 7.8879, "loss/crossentropy": 2.381661534309387, "loss/hidden": 3.09375, "loss/jsd": 0.0, "loss/logits": 0.24942730367183685, "step": 4968 }, { "epoch": 0.310625, "grad_norm": 2.796875, "grad_norm_var": 0.0171783447265625, "learning_rate": 0.0001, "loss": 7.7507, "loss/crossentropy": 2.438284397125244, "loss/hidden": 3.09375, "loss/jsd": 0.0, "loss/logits": 0.24986668676137924, "step": 4970 }, { "epoch": 0.31075, "grad_norm": 2.8125, "grad_norm_var": 0.013444010416666667, "learning_rate": 0.0001, "loss": 7.905, "loss/crossentropy": 2.286139726638794, "loss/hidden": 3.0234375, "loss/jsd": 0.0, "loss/logits": 0.24459806829690933, "step": 4972 }, { "epoch": 0.310875, "grad_norm": 2.9375, "grad_norm_var": 0.015250651041666667, "learning_rate": 0.0001, "loss": 7.8327, "loss/crossentropy": 2.196013927459717, "loss/hidden": 3.0390625, "loss/jsd": 0.0, "loss/logits": 0.23776336759328842, "step": 4974 }, { "epoch": 0.311, "grad_norm": 2.734375, "grad_norm_var": 0.010319010416666666, "learning_rate": 0.0001, "loss": 7.7442, "loss/crossentropy": 2.372404932975769, "loss/hidden": 3.109375, "loss/jsd": 0.0, "loss/logits": 0.22256699949502945, "step": 4976 }, { "epoch": 0.311125, "grad_norm": 5.25, "grad_norm_var": 0.4008534749348958, "learning_rate": 0.0001, "loss": 7.7857, "loss/crossentropy": 2.3137301206588745, "loss/hidden": 3.15625, "loss/jsd": 0.0, "loss/logits": 0.2522178292274475, "step": 4978 }, { "epoch": 0.31125, "grad_norm": 2.765625, "grad_norm_var": 0.3976521809895833, "learning_rate": 0.0001, "loss": 7.8251, "loss/crossentropy": 2.1099445819854736, "loss/hidden": 3.0625, "loss/jsd": 0.0, "loss/logits": 0.23862704634666443, "step": 4980 }, { "epoch": 0.311375, "grad_norm": 2.828125, "grad_norm_var": 0.3995269775390625, "learning_rate": 0.0001, "loss": 7.8625, "loss/crossentropy": 2.373143196105957, "loss/hidden": 3.109375, "loss/jsd": 0.0, "loss/logits": 0.2459760159254074, "step": 4982 }, { "epoch": 0.3115, "grad_norm": 2.703125, "grad_norm_var": 0.4048828125, "learning_rate": 0.0001, "loss": 7.625, "loss/crossentropy": 2.1492116451263428, "loss/hidden": 3.0390625, "loss/jsd": 0.0, "loss/logits": 0.23069991171360016, "step": 4984 }, { "epoch": 0.311625, "grad_norm": 2.875, "grad_norm_var": 0.3984771728515625, "learning_rate": 0.0001, "loss": 7.8744, "loss/crossentropy": 2.524581789970398, "loss/hidden": 3.0859375, "loss/jsd": 0.0, "loss/logits": 0.24163535982370377, "step": 4986 }, { "epoch": 0.31175, "grad_norm": 2.921875, "grad_norm_var": 0.40279947916666664, "learning_rate": 0.0001, "loss": 7.7391, "loss/crossentropy": 2.0795546770095825, "loss/hidden": 3.0234375, "loss/jsd": 0.0, "loss/logits": 0.2258833572268486, "step": 4988 }, { "epoch": 0.311875, "grad_norm": 2.546875, "grad_norm_var": 0.4060943603515625, "learning_rate": 0.0001, "loss": 7.6573, "loss/crossentropy": 2.389248847961426, "loss/hidden": 3.0390625, "loss/jsd": 0.0, "loss/logits": 0.2275315672159195, "step": 4990 }, { "epoch": 0.312, "grad_norm": 2.6875, "grad_norm_var": 0.40637613932291666, "learning_rate": 0.0001, "loss": 7.7716, "loss/crossentropy": 2.344989538192749, "loss/hidden": 3.1171875, "loss/jsd": 0.0, "loss/logits": 0.254401370882988, "step": 4992 }, { "epoch": 0.312125, "grad_norm": 2.703125, "grad_norm_var": 0.01148681640625, "learning_rate": 0.0001, "loss": 7.8882, "loss/crossentropy": 2.2956249713897705, "loss/hidden": 3.0703125, "loss/jsd": 0.0, "loss/logits": 0.23467770218849182, "step": 4994 }, { "epoch": 0.31225, "grad_norm": 2.75, "grad_norm_var": 0.010868326822916666, "learning_rate": 0.0001, "loss": 7.8919, "loss/crossentropy": 2.2571550607681274, "loss/hidden": 3.140625, "loss/jsd": 0.0, "loss/logits": 0.26118719577789307, "step": 4996 }, { "epoch": 0.312375, "grad_norm": 2.890625, "grad_norm_var": 0.016552734375, "learning_rate": 0.0001, "loss": 7.6358, "loss/crossentropy": 2.2911359071731567, "loss/hidden": 3.0546875, "loss/jsd": 0.0, "loss/logits": 0.22085828334093094, "step": 4998 }, { "epoch": 0.3125, "grad_norm": 2.921875, "grad_norm_var": 0.017210896809895834, "learning_rate": 0.0001, "loss": 7.9899, "loss/crossentropy": 2.2595224380493164, "loss/hidden": 3.140625, "loss/jsd": 0.0, "loss/logits": 0.25677596032619476, "step": 5000 }, { "epoch": 0.312625, "grad_norm": 2.6875, "grad_norm_var": 0.016780598958333334, "learning_rate": 0.0001, "loss": 7.7616, "loss/crossentropy": 2.249518036842346, "loss/hidden": 2.9765625, "loss/jsd": 0.0, "loss/logits": 0.24523073434829712, "step": 5002 }, { "epoch": 0.31275, "grad_norm": 3.09375, "grad_norm_var": 0.020099894205729166, "learning_rate": 0.0001, "loss": 7.6732, "loss/crossentropy": 2.1288769245147705, "loss/hidden": 3.03125, "loss/jsd": 0.0, "loss/logits": 0.22818374633789062, "step": 5004 }, { "epoch": 0.312875, "grad_norm": 2.734375, "grad_norm_var": 0.017438761393229165, "learning_rate": 0.0001, "loss": 7.7666, "loss/crossentropy": 2.4175007343292236, "loss/hidden": 3.078125, "loss/jsd": 0.0, "loss/logits": 0.2511685714125633, "step": 5006 }, { "epoch": 0.313, "grad_norm": 2.609375, "grad_norm_var": 0.019733683268229166, "learning_rate": 0.0001, "loss": 7.7253, "loss/crossentropy": 2.3725426197052, "loss/hidden": 3.09375, "loss/jsd": 0.0, "loss/logits": 0.2565712630748749, "step": 5008 }, { "epoch": 0.313125, "grad_norm": 2.71875, "grad_norm_var": 0.0180084228515625, "learning_rate": 0.0001, "loss": 7.6834, "loss/crossentropy": 2.1959269046783447, "loss/hidden": 3.078125, "loss/jsd": 0.0, "loss/logits": 0.2452130764722824, "step": 5010 }, { "epoch": 0.31325, "grad_norm": 2.65625, "grad_norm_var": 0.0213043212890625, "learning_rate": 0.0001, "loss": 7.8012, "loss/crossentropy": 2.3894975185394287, "loss/hidden": 3.0625, "loss/jsd": 0.0, "loss/logits": 0.23972920328378677, "step": 5012 }, { "epoch": 0.313375, "grad_norm": 2.953125, "grad_norm_var": 0.019527180989583334, "learning_rate": 0.0001, "loss": 7.7292, "loss/crossentropy": 2.239388346672058, "loss/hidden": 3.0, "loss/jsd": 0.0, "loss/logits": 0.22415932267904282, "step": 5014 }, { "epoch": 0.3135, "grad_norm": 2.859375, "grad_norm_var": 0.018358357747395835, "learning_rate": 0.0001, "loss": 8.0838, "loss/crossentropy": 2.3887226581573486, "loss/hidden": 3.015625, "loss/jsd": 0.0, "loss/logits": 0.24422915279865265, "step": 5016 }, { "epoch": 0.313625, "grad_norm": 2.453125, "grad_norm_var": 0.024364217122395834, "learning_rate": 0.0001, "loss": 7.4677, "loss/crossentropy": 2.274713635444641, "loss/hidden": 3.046875, "loss/jsd": 0.0, "loss/logits": 0.24600080400705338, "step": 5018 }, { "epoch": 0.31375, "grad_norm": 2.703125, "grad_norm_var": 0.015234375, "learning_rate": 0.0001, "loss": 7.7721, "loss/crossentropy": 2.474233388900757, "loss/hidden": 3.078125, "loss/jsd": 0.0, "loss/logits": 0.24578053504228592, "step": 5020 }, { "epoch": 0.313875, "grad_norm": 2.796875, "grad_norm_var": 0.012886555989583333, "learning_rate": 0.0001, "loss": 7.9269, "loss/crossentropy": 2.5158610343933105, "loss/hidden": 3.140625, "loss/jsd": 0.0, "loss/logits": 0.27125994861125946, "step": 5022 }, { "epoch": 0.314, "grad_norm": 3.15625, "grad_norm_var": 0.03172098795572917, "learning_rate": 0.0001, "loss": 7.6513, "loss/crossentropy": 2.314492344856262, "loss/hidden": 3.1015625, "loss/jsd": 0.0, "loss/logits": 0.2252868115901947, "step": 5024 }, { "epoch": 0.314125, "grad_norm": 3.5625, "grad_norm_var": 0.07766825358072917, "learning_rate": 0.0001, "loss": 7.9754, "loss/crossentropy": 2.2293580770492554, "loss/hidden": 3.09375, "loss/jsd": 0.0, "loss/logits": 0.2425660938024521, "step": 5026 }, { "epoch": 0.31425, "grad_norm": 2.921875, "grad_norm_var": 0.0713043212890625, "learning_rate": 0.0001, "loss": 7.723, "loss/crossentropy": 2.21551775932312, "loss/hidden": 3.09375, "loss/jsd": 0.0, "loss/logits": 0.24313673377037048, "step": 5028 }, { "epoch": 0.314375, "grad_norm": 2.78125, "grad_norm_var": 0.0704742431640625, "learning_rate": 0.0001, "loss": 7.9108, "loss/crossentropy": 2.395055890083313, "loss/hidden": 3.1328125, "loss/jsd": 0.0, "loss/logits": 0.23280347138643265, "step": 5030 }, { "epoch": 0.3145, "grad_norm": 2.734375, "grad_norm_var": 0.07105204264322916, "learning_rate": 0.0001, "loss": 7.8214, "loss/crossentropy": 2.396549344062805, "loss/hidden": 3.1171875, "loss/jsd": 0.0, "loss/logits": 0.23812869936227798, "step": 5032 }, { "epoch": 0.314625, "grad_norm": 2.65625, "grad_norm_var": 0.0636871337890625, "learning_rate": 0.0001, "loss": 7.8553, "loss/crossentropy": 2.134924054145813, "loss/hidden": 3.078125, "loss/jsd": 0.0, "loss/logits": 0.24307020008563995, "step": 5034 }, { "epoch": 0.31475, "grad_norm": 2.6875, "grad_norm_var": 0.06381734212239583, "learning_rate": 0.0001, "loss": 7.7512, "loss/crossentropy": 2.1205564737319946, "loss/hidden": 3.0546875, "loss/jsd": 0.0, "loss/logits": 0.2432844191789627, "step": 5036 }, { "epoch": 0.314875, "grad_norm": 2.734375, "grad_norm_var": 0.06396484375, "learning_rate": 0.0001, "loss": 7.79, "loss/crossentropy": 2.3331637382507324, "loss/hidden": 3.09375, "loss/jsd": 0.0, "loss/logits": 0.25594358891248703, "step": 5038 }, { "epoch": 0.315, "grad_norm": 2.578125, "grad_norm_var": 0.065625, "learning_rate": 0.0001, "loss": 7.4795, "loss/crossentropy": 2.1194041967391968, "loss/hidden": 3.0625, "loss/jsd": 0.0, "loss/logits": 0.21089819073677063, "step": 5040 }, { "epoch": 0.315125, "grad_norm": 3.03125, "grad_norm_var": 0.0417144775390625, "learning_rate": 0.0001, "loss": 7.6355, "loss/crossentropy": 2.170634150505066, "loss/hidden": 3.078125, "loss/jsd": 0.0, "loss/logits": 0.24634595215320587, "step": 5042 }, { "epoch": 0.31525, "grad_norm": 2.65625, "grad_norm_var": 0.0422027587890625, "learning_rate": 0.0001, "loss": 7.6986, "loss/crossentropy": 2.4505457878112793, "loss/hidden": 3.09375, "loss/jsd": 0.0, "loss/logits": 0.2487524002790451, "step": 5044 }, { "epoch": 0.315375, "grad_norm": 2.984375, "grad_norm_var": 0.04443359375, "learning_rate": 0.0001, "loss": 7.8527, "loss/crossentropy": 2.162651538848877, "loss/hidden": 3.0859375, "loss/jsd": 0.0, "loss/logits": 0.23960895836353302, "step": 5046 }, { "epoch": 0.3155, "grad_norm": 2.578125, "grad_norm_var": 0.04781494140625, "learning_rate": 0.0001, "loss": 7.6415, "loss/crossentropy": 2.2263429164886475, "loss/hidden": 3.0625, "loss/jsd": 0.0, "loss/logits": 0.24099727720022202, "step": 5048 }, { "epoch": 0.315625, "grad_norm": 2.671875, "grad_norm_var": 0.04655659993489583, "learning_rate": 0.0001, "loss": 7.6693, "loss/crossentropy": 1.9957892298698425, "loss/hidden": 3.0234375, "loss/jsd": 0.0, "loss/logits": 0.22675327211618423, "step": 5050 }, { "epoch": 0.31575, "grad_norm": 2.59375, "grad_norm_var": 0.051005045572916664, "learning_rate": 0.0001, "loss": 7.7113, "loss/crossentropy": 2.2677851915359497, "loss/hidden": 2.984375, "loss/jsd": 0.0, "loss/logits": 0.21885111927986145, "step": 5052 }, { "epoch": 0.315875, "grad_norm": 2.765625, "grad_norm_var": 0.055562337239583336, "learning_rate": 0.0001, "loss": 7.4862, "loss/crossentropy": 1.9054036736488342, "loss/hidden": 2.9765625, "loss/jsd": 0.0, "loss/logits": 0.20687615871429443, "step": 5054 }, { "epoch": 0.316, "grad_norm": 2.828125, "grad_norm_var": 0.05064188639322917, "learning_rate": 0.0001, "loss": 7.7715, "loss/crossentropy": 2.22283399105072, "loss/hidden": 3.046875, "loss/jsd": 0.0, "loss/logits": 0.23193835467100143, "step": 5056 }, { "epoch": 0.316125, "grad_norm": 2.703125, "grad_norm_var": 0.022215779622395834, "learning_rate": 0.0001, "loss": 7.8363, "loss/crossentropy": 2.417159676551819, "loss/hidden": 3.0703125, "loss/jsd": 0.0, "loss/logits": 0.2365451157093048, "step": 5058 }, { "epoch": 0.31625, "grad_norm": 2.796875, "grad_norm_var": 0.016844685872395834, "learning_rate": 0.0001, "loss": 7.7665, "loss/crossentropy": 2.2118507623672485, "loss/hidden": 3.09375, "loss/jsd": 0.0, "loss/logits": 0.2214958816766739, "step": 5060 }, { "epoch": 0.316375, "grad_norm": 2.6875, "grad_norm_var": 0.011116536458333333, "learning_rate": 0.0001, "loss": 7.8195, "loss/crossentropy": 2.145191192626953, "loss/hidden": 3.078125, "loss/jsd": 0.0, "loss/logits": 0.21712714433670044, "step": 5062 }, { "epoch": 0.3165, "grad_norm": 2.6875, "grad_norm_var": 0.012596638997395833, "learning_rate": 0.0001, "loss": 7.9165, "loss/crossentropy": 2.518553376197815, "loss/hidden": 3.140625, "loss/jsd": 0.0, "loss/logits": 0.27052685618400574, "step": 5064 }, { "epoch": 0.316625, "grad_norm": 2.78125, "grad_norm_var": 0.009675089518229167, "learning_rate": 0.0001, "loss": 7.6259, "loss/crossentropy": 2.288515567779541, "loss/hidden": 3.0234375, "loss/jsd": 0.0, "loss/logits": 0.23619475960731506, "step": 5066 }, { "epoch": 0.31675, "grad_norm": 2.765625, "grad_norm_var": 0.009663899739583334, "learning_rate": 0.0001, "loss": 7.5666, "loss/crossentropy": 2.4803104400634766, "loss/hidden": 3.078125, "loss/jsd": 0.0, "loss/logits": 0.23813430964946747, "step": 5068 }, { "epoch": 0.316875, "grad_norm": 2.703125, "grad_norm_var": 0.0070709228515625, "learning_rate": 0.0001, "loss": 7.7796, "loss/crossentropy": 2.377394676208496, "loss/hidden": 3.09375, "loss/jsd": 0.0, "loss/logits": 0.2384147122502327, "step": 5070 }, { "epoch": 0.317, "grad_norm": 2.609375, "grad_norm_var": 0.007421875, "learning_rate": 0.0001, "loss": 7.5268, "loss/crossentropy": 2.0047815442085266, "loss/hidden": 2.9921875, "loss/jsd": 0.0, "loss/logits": 0.23423293977975845, "step": 5072 }, { "epoch": 0.317125, "grad_norm": 2.734375, "grad_norm_var": 0.007697550455729166, "learning_rate": 0.0001, "loss": 7.8697, "loss/crossentropy": 2.261942982673645, "loss/hidden": 3.125, "loss/jsd": 0.0, "loss/logits": 0.25034070760011673, "step": 5074 }, { "epoch": 0.31725, "grad_norm": 2.90625, "grad_norm_var": 0.009684244791666666, "learning_rate": 0.0001, "loss": 7.713, "loss/crossentropy": 2.32420015335083, "loss/hidden": 3.046875, "loss/jsd": 0.0, "loss/logits": 0.22990263998508453, "step": 5076 }, { "epoch": 0.317375, "grad_norm": 2.734375, "grad_norm_var": 0.05944010416666667, "learning_rate": 0.0001, "loss": 7.8802, "loss/crossentropy": 2.4876354932785034, "loss/hidden": 3.0703125, "loss/jsd": 0.0, "loss/logits": 0.2322482466697693, "step": 5078 }, { "epoch": 0.3175, "grad_norm": 2.828125, "grad_norm_var": 0.056428019205729166, "learning_rate": 0.0001, "loss": 7.9242, "loss/crossentropy": 2.364230155944824, "loss/hidden": 3.1171875, "loss/jsd": 0.0, "loss/logits": 0.23250942677259445, "step": 5080 }, { "epoch": 0.317625, "grad_norm": 2.515625, "grad_norm_var": 0.061310831705729166, "learning_rate": 0.0001, "loss": 7.563, "loss/crossentropy": 2.2485674619674683, "loss/hidden": 3.0859375, "loss/jsd": 0.0, "loss/logits": 0.24100147932767868, "step": 5082 }, { "epoch": 0.31775, "grad_norm": 2.84375, "grad_norm_var": 0.05921223958333333, "learning_rate": 0.0001, "loss": 7.9548, "loss/crossentropy": 2.294622540473938, "loss/hidden": 3.1328125, "loss/jsd": 0.0, "loss/logits": 0.2590477392077446, "step": 5084 }, { "epoch": 0.317875, "grad_norm": 2.78125, "grad_norm_var": 0.0569732666015625, "learning_rate": 0.0001, "loss": 7.8819, "loss/crossentropy": 2.497892141342163, "loss/hidden": 3.0625, "loss/jsd": 0.0, "loss/logits": 0.25125233083963394, "step": 5086 }, { "epoch": 0.318, "grad_norm": 2.671875, "grad_norm_var": 0.055939737955729166, "learning_rate": 0.0001, "loss": 7.7125, "loss/crossentropy": 2.3443859815597534, "loss/hidden": 3.078125, "loss/jsd": 0.0, "loss/logits": 0.2524673342704773, "step": 5088 }, { "epoch": 0.318125, "grad_norm": 2.59375, "grad_norm_var": 0.05676676432291667, "learning_rate": 0.0001, "loss": 7.7918, "loss/crossentropy": 2.3130171298980713, "loss/hidden": 3.0625, "loss/jsd": 0.0, "loss/logits": 0.2431221380829811, "step": 5090 }, { "epoch": 0.31825, "grad_norm": 2.84375, "grad_norm_var": 0.057352701822916664, "learning_rate": 0.0001, "loss": 8.0047, "loss/crossentropy": 2.4693344831466675, "loss/hidden": 3.1875, "loss/jsd": 0.0, "loss/logits": 0.3447949290275574, "step": 5092 }, { "epoch": 0.318375, "grad_norm": 2.96875, "grad_norm_var": 0.017692057291666667, "learning_rate": 0.0001, "loss": 7.6993, "loss/crossentropy": 2.1724804639816284, "loss/hidden": 3.0, "loss/jsd": 0.0, "loss/logits": 0.2206498384475708, "step": 5094 }, { "epoch": 0.3185, "grad_norm": 2.703125, "grad_norm_var": 0.016141764322916665, "learning_rate": 0.0001, "loss": 7.7238, "loss/crossentropy": 2.0963892936706543, "loss/hidden": 3.046875, "loss/jsd": 0.0, "loss/logits": 0.21788883954286575, "step": 5096 }, { "epoch": 0.318625, "grad_norm": 2.90625, "grad_norm_var": 0.012662760416666667, "learning_rate": 0.0001, "loss": 7.685, "loss/crossentropy": 2.123319149017334, "loss/hidden": 2.96875, "loss/jsd": 0.0, "loss/logits": 0.21226254105567932, "step": 5098 }, { "epoch": 0.31875, "grad_norm": 2.5625, "grad_norm_var": 0.019169108072916666, "learning_rate": 0.0001, "loss": 7.5787, "loss/crossentropy": 2.158640444278717, "loss/hidden": 3.0, "loss/jsd": 0.0, "loss/logits": 0.24475891888141632, "step": 5100 }, { "epoch": 0.318875, "grad_norm": 2.5, "grad_norm_var": 0.025340779622395834, "learning_rate": 0.0001, "loss": 7.5252, "loss/crossentropy": 2.1854825019836426, "loss/hidden": 3.03125, "loss/jsd": 0.0, "loss/logits": 0.22802812606096268, "step": 5102 }, { "epoch": 0.319, "grad_norm": 2.671875, "grad_norm_var": 0.025581868489583333, "learning_rate": 0.0001, "loss": 7.7928, "loss/crossentropy": 1.9574860334396362, "loss/hidden": 3.0234375, "loss/jsd": 0.0, "loss/logits": 0.24525155127048492, "step": 5104 }, { "epoch": 0.319125, "grad_norm": 2.640625, "grad_norm_var": 0.025340779622395834, "learning_rate": 0.0001, "loss": 7.5918, "loss/crossentropy": 2.148212254047394, "loss/hidden": 2.9609375, "loss/jsd": 0.0, "loss/logits": 0.23015128076076508, "step": 5106 }, { "epoch": 0.31925, "grad_norm": 2.890625, "grad_norm_var": 0.024247233072916666, "learning_rate": 0.0001, "loss": 7.7043, "loss/crossentropy": 2.308470845222473, "loss/hidden": 3.1328125, "loss/jsd": 0.0, "loss/logits": 0.24996764212846756, "step": 5108 }, { "epoch": 0.319375, "grad_norm": 2.625, "grad_norm_var": 0.01715087890625, "learning_rate": 0.0001, "loss": 7.7892, "loss/crossentropy": 2.1219321489334106, "loss/hidden": 2.90625, "loss/jsd": 0.0, "loss/logits": 0.2263498604297638, "step": 5110 }, { "epoch": 0.3195, "grad_norm": 3.046875, "grad_norm_var": 0.030106608072916666, "learning_rate": 0.0001, "loss": 7.8395, "loss/crossentropy": 2.2513787746429443, "loss/hidden": 2.984375, "loss/jsd": 0.0, "loss/logits": 0.22190719842910767, "step": 5112 }, { "epoch": 0.319625, "grad_norm": 2.640625, "grad_norm_var": 0.0299713134765625, "learning_rate": 0.0001, "loss": 7.7326, "loss/crossentropy": 2.4013614654541016, "loss/hidden": 3.1328125, "loss/jsd": 0.0, "loss/logits": 0.25737185776233673, "step": 5114 }, { "epoch": 0.31975, "grad_norm": 2.859375, "grad_norm_var": 0.028889973958333332, "learning_rate": 0.0001, "loss": 7.9809, "loss/crossentropy": 2.4230066537857056, "loss/hidden": 3.109375, "loss/jsd": 0.0, "loss/logits": 0.23899466544389725, "step": 5116 }, { "epoch": 0.319875, "grad_norm": 2.59375, "grad_norm_var": 0.024739583333333332, "learning_rate": 0.0001, "loss": 7.6865, "loss/crossentropy": 2.0802340507507324, "loss/hidden": 3.046875, "loss/jsd": 0.0, "loss/logits": 0.25623180717229843, "step": 5118 }, { "epoch": 0.32, "grad_norm": 2.75, "grad_norm_var": 0.025617472330729165, "learning_rate": 0.0001, "loss": 7.6539, "loss/crossentropy": 2.1477943658828735, "loss/hidden": 3.015625, "loss/jsd": 0.0, "loss/logits": 0.23491913080215454, "step": 5120 }, { "epoch": 0.320125, "grad_norm": 4.1875, "grad_norm_var": 0.15806884765625, "learning_rate": 0.0001, "loss": 7.6989, "loss/crossentropy": 2.291605234146118, "loss/hidden": 3.0, "loss/jsd": 0.0, "loss/logits": 0.22172336280345917, "step": 5122 }, { "epoch": 0.32025, "grad_norm": 3.0625, "grad_norm_var": 0.15554097493489583, "learning_rate": 0.0001, "loss": 7.6641, "loss/crossentropy": 2.0170804262161255, "loss/hidden": 3.0703125, "loss/jsd": 0.0, "loss/logits": 0.23731420189142227, "step": 5124 }, { "epoch": 0.320375, "grad_norm": 2.859375, "grad_norm_var": 0.15526936848958334, "learning_rate": 0.0001, "loss": 7.7426, "loss/crossentropy": 2.276698112487793, "loss/hidden": 3.0390625, "loss/jsd": 0.0, "loss/logits": 0.22097814083099365, "step": 5126 }, { "epoch": 0.3205, "grad_norm": 3.03125, "grad_norm_var": 0.15568033854166666, "learning_rate": 0.0001, "loss": 7.8328, "loss/crossentropy": 2.2059558629989624, "loss/hidden": 3.078125, "loss/jsd": 0.0, "loss/logits": 0.23666708171367645, "step": 5128 }, { "epoch": 0.320625, "grad_norm": 2.984375, "grad_norm_var": 0.15746968587239582, "learning_rate": 0.0001, "loss": 7.7709, "loss/crossentropy": 2.4474403858184814, "loss/hidden": 3.1640625, "loss/jsd": 0.0, "loss/logits": 0.2520660534501076, "step": 5130 }, { "epoch": 0.32075, "grad_norm": 2.6875, "grad_norm_var": 0.1659332275390625, "learning_rate": 0.0001, "loss": 7.8644, "loss/crossentropy": 2.1484639644622803, "loss/hidden": 3.09375, "loss/jsd": 0.0, "loss/logits": 0.264514684677124, "step": 5132 }, { "epoch": 0.320875, "grad_norm": 3.3125, "grad_norm_var": 0.17473551432291667, "learning_rate": 0.0001, "loss": 7.7105, "loss/crossentropy": 2.0804697275161743, "loss/hidden": 3.03125, "loss/jsd": 0.0, "loss/logits": 0.23252758383750916, "step": 5134 }, { "epoch": 0.321, "grad_norm": 2.796875, "grad_norm_var": 0.1693267822265625, "learning_rate": 0.0001, "loss": 7.8483, "loss/crossentropy": 2.422489643096924, "loss/hidden": 3.078125, "loss/jsd": 0.0, "loss/logits": 0.2377927079796791, "step": 5136 }, { "epoch": 0.321125, "grad_norm": 2.65625, "grad_norm_var": 0.05426025390625, "learning_rate": 0.0001, "loss": 7.6084, "loss/crossentropy": 2.294679641723633, "loss/hidden": 3.0390625, "loss/jsd": 0.0, "loss/logits": 0.22832678258419037, "step": 5138 }, { "epoch": 0.32125, "grad_norm": 2.8125, "grad_norm_var": 0.0474273681640625, "learning_rate": 0.0001, "loss": 7.8155, "loss/crossentropy": 2.2104777097702026, "loss/hidden": 3.078125, "loss/jsd": 0.0, "loss/logits": 0.23934491723775864, "step": 5140 }, { "epoch": 0.321375, "grad_norm": 2.765625, "grad_norm_var": 0.0420318603515625, "learning_rate": 0.0001, "loss": 7.8261, "loss/crossentropy": 2.279021143913269, "loss/hidden": 3.125, "loss/jsd": 0.0, "loss/logits": 0.2535970136523247, "step": 5142 }, { "epoch": 0.3215, "grad_norm": 2.6875, "grad_norm_var": 0.0413238525390625, "learning_rate": 0.0001, "loss": 7.8555, "loss/crossentropy": 2.349562406539917, "loss/hidden": 3.0234375, "loss/jsd": 0.0, "loss/logits": 0.22609640657901764, "step": 5144 }, { "epoch": 0.321625, "grad_norm": 2.71875, "grad_norm_var": 0.037093098958333334, "learning_rate": 0.0001, "loss": 7.5736, "loss/crossentropy": 2.4062600135803223, "loss/hidden": 2.9765625, "loss/jsd": 0.0, "loss/logits": 0.2405339851975441, "step": 5146 }, { "epoch": 0.32175, "grad_norm": 2.890625, "grad_norm_var": 0.034684244791666666, "learning_rate": 0.0001, "loss": 7.8345, "loss/crossentropy": 2.3120803833007812, "loss/hidden": 3.0546875, "loss/jsd": 0.0, "loss/logits": 0.24249491095542908, "step": 5148 }, { "epoch": 0.321875, "grad_norm": 2.734375, "grad_norm_var": 0.015283203125, "learning_rate": 0.0001, "loss": 7.8181, "loss/crossentropy": 2.2951018810272217, "loss/hidden": 3.046875, "loss/jsd": 0.0, "loss/logits": 0.22178703546524048, "step": 5150 }, { "epoch": 0.322, "grad_norm": 2.671875, "grad_norm_var": 0.015283203125, "learning_rate": 0.0001, "loss": 7.6344, "loss/crossentropy": 2.367525100708008, "loss/hidden": 3.0703125, "loss/jsd": 0.0, "loss/logits": 0.22857095301151276, "step": 5152 }, { "epoch": 0.322125, "grad_norm": 2.625, "grad_norm_var": 0.010380045572916666, "learning_rate": 0.0001, "loss": 7.6356, "loss/crossentropy": 2.222907304763794, "loss/hidden": 2.9609375, "loss/jsd": 0.0, "loss/logits": 0.22237370908260345, "step": 5154 }, { "epoch": 0.32225, "grad_norm": 2.875, "grad_norm_var": 0.011815388997395834, "learning_rate": 0.0001, "loss": 7.6395, "loss/crossentropy": 2.2401821613311768, "loss/hidden": 3.1328125, "loss/jsd": 0.0, "loss/logits": 0.26902496814727783, "step": 5156 }, { "epoch": 0.322375, "grad_norm": 2.609375, "grad_norm_var": 0.011522420247395833, "learning_rate": 0.0001, "loss": 7.6737, "loss/crossentropy": 2.1432089805603027, "loss/hidden": 3.0078125, "loss/jsd": 0.0, "loss/logits": 0.23258410394191742, "step": 5158 }, { "epoch": 0.3225, "grad_norm": 2.859375, "grad_norm_var": 0.0121490478515625, "learning_rate": 0.0001, "loss": 7.6745, "loss/crossentropy": 2.2702345848083496, "loss/hidden": 3.0234375, "loss/jsd": 0.0, "loss/logits": 0.23792298138141632, "step": 5160 }, { "epoch": 0.322625, "grad_norm": 2.703125, "grad_norm_var": 0.014632161458333333, "learning_rate": 0.0001, "loss": 7.7568, "loss/crossentropy": 2.303900957107544, "loss/hidden": 3.078125, "loss/jsd": 0.0, "loss/logits": 0.2551003098487854, "step": 5162 }, { "epoch": 0.32275, "grad_norm": 2.890625, "grad_norm_var": 0.015034993489583334, "learning_rate": 0.0001, "loss": 7.8105, "loss/crossentropy": 2.132431387901306, "loss/hidden": 3.0703125, "loss/jsd": 0.0, "loss/logits": 0.22536294162273407, "step": 5164 }, { "epoch": 0.322875, "grad_norm": 2.796875, "grad_norm_var": 0.013277180989583333, "learning_rate": 0.0001, "loss": 7.5825, "loss/crossentropy": 2.380275845527649, "loss/hidden": 3.0859375, "loss/jsd": 0.0, "loss/logits": 0.25398004055023193, "step": 5166 }, { "epoch": 0.323, "grad_norm": 2.6875, "grad_norm_var": 0.013133748372395834, "learning_rate": 0.0001, "loss": 7.8355, "loss/crossentropy": 2.1352909207344055, "loss/hidden": 3.1328125, "loss/jsd": 0.0, "loss/logits": 0.26215773820877075, "step": 5168 }, { "epoch": 0.323125, "grad_norm": 2.640625, "grad_norm_var": 0.008698527018229167, "learning_rate": 0.0001, "loss": 7.4394, "loss/crossentropy": 2.131700277328491, "loss/hidden": 3.0703125, "loss/jsd": 0.0, "loss/logits": 0.23099368065595627, "step": 5170 }, { "epoch": 0.32325, "grad_norm": 2.859375, "grad_norm_var": 0.008763631184895834, "learning_rate": 0.0001, "loss": 7.7516, "loss/crossentropy": 2.2704739570617676, "loss/hidden": 3.109375, "loss/jsd": 0.0, "loss/logits": 0.2388148009777069, "step": 5172 }, { "epoch": 0.323375, "grad_norm": 2.625, "grad_norm_var": 0.009663899739583334, "learning_rate": 0.0001, "loss": 7.5846, "loss/crossentropy": 2.523204207420349, "loss/hidden": 3.0, "loss/jsd": 0.0, "loss/logits": 0.23497751355171204, "step": 5174 }, { "epoch": 0.3235, "grad_norm": 2.71875, "grad_norm_var": 0.008812459309895833, "learning_rate": 0.0001, "loss": 7.8189, "loss/crossentropy": 2.23435378074646, "loss/hidden": 3.1796875, "loss/jsd": 0.0, "loss/logits": 0.27021078765392303, "step": 5176 }, { "epoch": 0.323625, "grad_norm": 2.84375, "grad_norm_var": 0.010270182291666667, "learning_rate": 0.0001, "loss": 7.6161, "loss/crossentropy": 2.3979127407073975, "loss/hidden": 3.0390625, "loss/jsd": 0.0, "loss/logits": 0.23345688730478287, "step": 5178 }, { "epoch": 0.32375, "grad_norm": 2.90625, "grad_norm_var": 0.011066691080729166, "learning_rate": 0.0001, "loss": 7.8091, "loss/crossentropy": 2.387367844581604, "loss/hidden": 3.15625, "loss/jsd": 0.0, "loss/logits": 0.2526174485683441, "step": 5180 }, { "epoch": 0.323875, "grad_norm": 2.734375, "grad_norm_var": 0.011311848958333334, "learning_rate": 0.0001, "loss": 7.8055, "loss/crossentropy": 2.269148826599121, "loss/hidden": 3.1171875, "loss/jsd": 0.0, "loss/logits": 0.2551468312740326, "step": 5182 }, { "epoch": 0.324, "grad_norm": 2.59375, "grad_norm_var": 0.013960774739583333, "learning_rate": 0.0001, "loss": 7.5131, "loss/crossentropy": 2.0708478689193726, "loss/hidden": 2.9609375, "loss/jsd": 0.0, "loss/logits": 0.2231597974896431, "step": 5184 }, { "epoch": 0.324125, "grad_norm": 2.703125, "grad_norm_var": 0.01484375, "learning_rate": 0.0001, "loss": 7.8128, "loss/crossentropy": 2.3981924057006836, "loss/hidden": 3.078125, "loss/jsd": 0.0, "loss/logits": 0.24011054635047913, "step": 5186 }, { "epoch": 0.32425, "grad_norm": 2.78125, "grad_norm_var": 0.013988240559895834, "learning_rate": 0.0001, "loss": 7.8012, "loss/crossentropy": 2.1334726810455322, "loss/hidden": 2.921875, "loss/jsd": 0.0, "loss/logits": 0.23366482555866241, "step": 5188 }, { "epoch": 0.324375, "grad_norm": 2.671875, "grad_norm_var": 0.013060506184895833, "learning_rate": 0.0001, "loss": 7.8295, "loss/crossentropy": 2.2415345311164856, "loss/hidden": 3.125, "loss/jsd": 0.0, "loss/logits": 0.24810035526752472, "step": 5190 }, { "epoch": 0.3245, "grad_norm": 2.75, "grad_norm_var": 0.013996378580729166, "learning_rate": 0.0001, "loss": 8.1499, "loss/crossentropy": 2.7943782806396484, "loss/hidden": 3.1796875, "loss/jsd": 0.0, "loss/logits": 0.2604614943265915, "step": 5192 }, { "epoch": 0.324625, "grad_norm": 2.75, "grad_norm_var": 0.010282389322916667, "learning_rate": 0.0001, "loss": 7.7283, "loss/crossentropy": 2.5183229446411133, "loss/hidden": 3.03125, "loss/jsd": 0.0, "loss/logits": 0.23073634505271912, "step": 5194 }, { "epoch": 0.32475, "grad_norm": 2.84375, "grad_norm_var": 0.009504191080729167, "learning_rate": 0.0001, "loss": 7.6065, "loss/crossentropy": 2.345661163330078, "loss/hidden": 3.078125, "loss/jsd": 0.0, "loss/logits": 0.23722659796476364, "step": 5196 }, { "epoch": 0.324875, "grad_norm": 2.8125, "grad_norm_var": 0.010188802083333334, "learning_rate": 0.0001, "loss": 7.7939, "loss/crossentropy": 2.1507351398468018, "loss/hidden": 3.015625, "loss/jsd": 0.0, "loss/logits": 0.2460515797138214, "step": 5198 }, { "epoch": 0.325, "grad_norm": 2.796875, "grad_norm_var": 0.00601806640625, "learning_rate": 0.0001, "loss": 7.6696, "loss/crossentropy": 2.1691447496414185, "loss/hidden": 3.0078125, "loss/jsd": 0.0, "loss/logits": 0.2416650354862213, "step": 5200 }, { "epoch": 0.325125, "grad_norm": 2.65625, "grad_norm_var": 0.0060455322265625, "learning_rate": 0.0001, "loss": 7.8447, "loss/crossentropy": 2.363338589668274, "loss/hidden": 3.0859375, "loss/jsd": 0.0, "loss/logits": 0.24518629908561707, "step": 5202 }, { "epoch": 0.32525, "grad_norm": 2.734375, "grad_norm_var": 0.007575480143229166, "learning_rate": 0.0001, "loss": 7.6606, "loss/crossentropy": 2.3963630199432373, "loss/hidden": 3.015625, "loss/jsd": 0.0, "loss/logits": 0.2324187010526657, "step": 5204 }, { "epoch": 0.325375, "grad_norm": 2.65625, "grad_norm_var": 0.007624308268229167, "learning_rate": 0.0001, "loss": 7.7043, "loss/crossentropy": 2.4719268083572388, "loss/hidden": 3.1015625, "loss/jsd": 0.0, "loss/logits": 0.24863296747207642, "step": 5206 }, { "epoch": 0.3255, "grad_norm": 2.578125, "grad_norm_var": 0.00738525390625, "learning_rate": 0.0001, "loss": 7.7675, "loss/crossentropy": 2.388221263885498, "loss/hidden": 3.078125, "loss/jsd": 0.0, "loss/logits": 0.25198640674352646, "step": 5208 }, { "epoch": 0.325625, "grad_norm": 2.578125, "grad_norm_var": 0.008942667643229167, "learning_rate": 0.0001, "loss": 7.6965, "loss/crossentropy": 2.2551814317703247, "loss/hidden": 3.0703125, "loss/jsd": 0.0, "loss/logits": 0.22185778617858887, "step": 5210 }, { "epoch": 0.32575, "grad_norm": 2.5625, "grad_norm_var": 0.008658854166666667, "learning_rate": 0.0001, "loss": 7.7933, "loss/crossentropy": 2.290836215019226, "loss/hidden": 3.109375, "loss/jsd": 0.0, "loss/logits": 0.2299043834209442, "step": 5212 }, { "epoch": 0.325875, "grad_norm": 2.796875, "grad_norm_var": 0.006705729166666666, "learning_rate": 0.0001, "loss": 7.7431, "loss/crossentropy": 2.46811842918396, "loss/hidden": 3.1328125, "loss/jsd": 0.0, "loss/logits": 0.24950218200683594, "step": 5214 }, { "epoch": 0.326, "grad_norm": 2.609375, "grad_norm_var": 0.006766764322916666, "learning_rate": 0.0001, "loss": 7.6595, "loss/crossentropy": 2.0848947763442993, "loss/hidden": 3.0, "loss/jsd": 0.0, "loss/logits": 0.22587470710277557, "step": 5216 }, { "epoch": 0.326125, "grad_norm": 2.640625, "grad_norm_var": 0.006371053059895834, "learning_rate": 0.0001, "loss": 7.5255, "loss/crossentropy": 2.2194297313690186, "loss/hidden": 3.078125, "loss/jsd": 0.0, "loss/logits": 0.23032743483781815, "step": 5218 }, { "epoch": 0.32625, "grad_norm": 2.734375, "grad_norm_var": 0.0061920166015625, "learning_rate": 0.0001, "loss": 7.686, "loss/crossentropy": 2.19452440738678, "loss/hidden": 3.1171875, "loss/jsd": 0.0, "loss/logits": 0.26612579822540283, "step": 5220 }, { "epoch": 0.326375, "grad_norm": 2.671875, "grad_norm_var": 0.006200154622395833, "learning_rate": 0.0001, "loss": 7.7468, "loss/crossentropy": 2.138498604297638, "loss/hidden": 3.078125, "loss/jsd": 0.0, "loss/logits": 0.2383286952972412, "step": 5222 }, { "epoch": 0.3265, "grad_norm": 2.796875, "grad_norm_var": 0.0073394775390625, "learning_rate": 0.0001, "loss": 7.8559, "loss/crossentropy": 2.345498204231262, "loss/hidden": 3.125, "loss/jsd": 0.0, "loss/logits": 0.25634923577308655, "step": 5224 }, { "epoch": 0.326625, "grad_norm": 3.46875, "grad_norm_var": 0.04537760416666667, "learning_rate": 0.0001, "loss": 7.9487, "loss/crossentropy": 2.1613484621047974, "loss/hidden": 3.0625, "loss/jsd": 0.0, "loss/logits": 0.24952052533626556, "step": 5226 }, { "epoch": 0.32675, "grad_norm": 2.875, "grad_norm_var": 0.046483357747395836, "learning_rate": 0.0001, "loss": 7.7318, "loss/crossentropy": 2.4145522117614746, "loss/hidden": 3.0625, "loss/jsd": 0.0, "loss/logits": 0.2497653216123581, "step": 5228 }, { "epoch": 0.326875, "grad_norm": 2.921875, "grad_norm_var": 0.05237223307291667, "learning_rate": 0.0001, "loss": 7.7819, "loss/crossentropy": 2.2038283348083496, "loss/hidden": 3.046875, "loss/jsd": 0.0, "loss/logits": 0.23818396031856537, "step": 5230 }, { "epoch": 0.327, "grad_norm": 2.609375, "grad_norm_var": 0.048388671875, "learning_rate": 0.0001, "loss": 7.6333, "loss/crossentropy": 2.2541314363479614, "loss/hidden": 3.1015625, "loss/jsd": 0.0, "loss/logits": 0.23915573209524155, "step": 5232 }, { "epoch": 0.327125, "grad_norm": 2.75, "grad_norm_var": 0.04450581868489583, "learning_rate": 0.0001, "loss": 7.7337, "loss/crossentropy": 2.2965915203094482, "loss/hidden": 3.0546875, "loss/jsd": 0.0, "loss/logits": 0.24025513231754303, "step": 5234 }, { "epoch": 0.32725, "grad_norm": 2.6875, "grad_norm_var": 0.04411519368489583, "learning_rate": 0.0001, "loss": 7.6805, "loss/crossentropy": 2.1780813932418823, "loss/hidden": 2.9453125, "loss/jsd": 0.0, "loss/logits": 0.23688644170761108, "step": 5236 }, { "epoch": 0.327375, "grad_norm": 2.734375, "grad_norm_var": 0.0496978759765625, "learning_rate": 0.0001, "loss": 7.5945, "loss/crossentropy": 2.3092637062072754, "loss/hidden": 3.0625, "loss/jsd": 0.0, "loss/logits": 0.24303218722343445, "step": 5238 }, { "epoch": 0.3275, "grad_norm": 3.0625, "grad_norm_var": 0.053125, "learning_rate": 0.0001, "loss": 7.9014, "loss/crossentropy": 2.481802463531494, "loss/hidden": 3.1328125, "loss/jsd": 0.0, "loss/logits": 0.2627209722995758, "step": 5240 }, { "epoch": 0.327625, "grad_norm": 2.625, "grad_norm_var": 0.0282379150390625, "learning_rate": 0.0001, "loss": 7.8767, "loss/crossentropy": 2.695994734764099, "loss/hidden": 3.1171875, "loss/jsd": 0.0, "loss/logits": 0.24784637242555618, "step": 5242 }, { "epoch": 0.32775, "grad_norm": 2.75, "grad_norm_var": 0.032201131184895836, "learning_rate": 0.0001, "loss": 7.6727, "loss/crossentropy": 2.165263533592224, "loss/hidden": 3.015625, "loss/jsd": 0.0, "loss/logits": 0.25491930544376373, "step": 5244 }, { "epoch": 0.327875, "grad_norm": 2.796875, "grad_norm_var": 0.0233306884765625, "learning_rate": 0.0001, "loss": 8.0537, "loss/crossentropy": 2.4574962854385376, "loss/hidden": 3.1015625, "loss/jsd": 0.0, "loss/logits": 0.24603377282619476, "step": 5246 }, { "epoch": 0.328, "grad_norm": 2.515625, "grad_norm_var": 0.023486328125, "learning_rate": 0.0001, "loss": 7.7685, "loss/crossentropy": 2.352921485900879, "loss/hidden": 3.0703125, "loss/jsd": 0.0, "loss/logits": 0.2197057530283928, "step": 5248 }, { "epoch": 0.328125, "grad_norm": 2.75, "grad_norm_var": 0.024689737955729166, "learning_rate": 0.0001, "loss": 7.559, "loss/crossentropy": 1.9308044910430908, "loss/hidden": 2.9375, "loss/jsd": 0.0, "loss/logits": 0.216501384973526, "step": 5250 }, { "epoch": 0.32825, "grad_norm": 3.453125, "grad_norm_var": 0.7048248291015625, "learning_rate": 0.0001, "loss": 7.6766, "loss/crossentropy": 2.2286916971206665, "loss/hidden": 3.078125, "loss/jsd": 0.0, "loss/logits": 0.23412299156188965, "step": 5252 }, { "epoch": 0.328375, "grad_norm": 3.03125, "grad_norm_var": 0.6932576497395834, "learning_rate": 0.0001, "loss": 7.8602, "loss/crossentropy": 2.36347496509552, "loss/hidden": 3.140625, "loss/jsd": 0.0, "loss/logits": 0.2449544221162796, "step": 5254 }, { "epoch": 0.3285, "grad_norm": 2.765625, "grad_norm_var": 0.701220703125, "learning_rate": 0.0001, "loss": 7.6955, "loss/crossentropy": 2.3326140642166138, "loss/hidden": 3.125, "loss/jsd": 0.0, "loss/logits": 0.2574557065963745, "step": 5256 }, { "epoch": 0.328625, "grad_norm": 2.703125, "grad_norm_var": 0.6971638997395834, "learning_rate": 0.0001, "loss": 7.4471, "loss/crossentropy": 1.815782368183136, "loss/hidden": 3.0078125, "loss/jsd": 0.0, "loss/logits": 0.18647972494363785, "step": 5258 }, { "epoch": 0.32875, "grad_norm": 3.21875, "grad_norm_var": 0.6794260660807292, "learning_rate": 0.0001, "loss": 7.5748, "loss/crossentropy": 2.1120768785476685, "loss/hidden": 2.90625, "loss/jsd": 0.0, "loss/logits": 0.21936962753534317, "step": 5260 }, { "epoch": 0.328875, "grad_norm": 3.03125, "grad_norm_var": 0.6756174723307292, "learning_rate": 0.0001, "loss": 7.732, "loss/crossentropy": 2.1649681329727173, "loss/hidden": 3.1171875, "loss/jsd": 0.0, "loss/logits": 0.25797802954912186, "step": 5262 }, { "epoch": 0.329, "grad_norm": 2.65625, "grad_norm_var": 0.6711222330729166, "learning_rate": 0.0001, "loss": 7.5741, "loss/crossentropy": 2.120874285697937, "loss/hidden": 2.9921875, "loss/jsd": 0.0, "loss/logits": 0.21845345199108124, "step": 5264 }, { "epoch": 0.329125, "grad_norm": 2.78125, "grad_norm_var": 0.6541819254557292, "learning_rate": 0.0001, "loss": 7.7228, "loss/crossentropy": 2.1182167530059814, "loss/hidden": 3.015625, "loss/jsd": 0.0, "loss/logits": 0.2384987398982048, "step": 5266 }, { "epoch": 0.32925, "grad_norm": 2.546875, "grad_norm_var": 0.054850260416666664, "learning_rate": 0.0001, "loss": 7.8669, "loss/crossentropy": 2.253667116165161, "loss/hidden": 3.046875, "loss/jsd": 0.0, "loss/logits": 0.2525147944688797, "step": 5268 }, { "epoch": 0.329375, "grad_norm": 2.671875, "grad_norm_var": 0.027783203125, "learning_rate": 0.0001, "loss": 7.7053, "loss/crossentropy": 2.205715298652649, "loss/hidden": 3.0390625, "loss/jsd": 0.0, "loss/logits": 0.23111368715763092, "step": 5270 }, { "epoch": 0.3295, "grad_norm": 2.75, "grad_norm_var": 0.028400675455729166, "learning_rate": 0.0001, "loss": 7.7319, "loss/crossentropy": 2.2363312244415283, "loss/hidden": 3.078125, "loss/jsd": 0.0, "loss/logits": 0.21432821452617645, "step": 5272 }, { "epoch": 0.329625, "grad_norm": 2.8125, "grad_norm_var": 0.029069010416666666, "learning_rate": 0.0001, "loss": 7.7165, "loss/crossentropy": 2.2053908109664917, "loss/hidden": 3.03125, "loss/jsd": 0.0, "loss/logits": 0.22934895753860474, "step": 5274 }, { "epoch": 0.32975, "grad_norm": 2.546875, "grad_norm_var": 0.0172271728515625, "learning_rate": 0.0001, "loss": 7.7792, "loss/crossentropy": 2.3969067335128784, "loss/hidden": 3.0625, "loss/jsd": 0.0, "loss/logits": 0.2370493859052658, "step": 5276 }, { "epoch": 0.329875, "grad_norm": 2.671875, "grad_norm_var": 0.01011962890625, "learning_rate": 0.0001, "loss": 7.8946, "loss/crossentropy": 2.470771551132202, "loss/hidden": 3.140625, "loss/jsd": 0.0, "loss/logits": 0.2473364919424057, "step": 5278 }, { "epoch": 0.33, "grad_norm": 2.65625, "grad_norm_var": 0.01177978515625, "learning_rate": 0.0001, "loss": 7.5666, "loss/crossentropy": 2.298273205757141, "loss/hidden": 3.0234375, "loss/jsd": 0.0, "loss/logits": 0.2253274843096733, "step": 5280 }, { "epoch": 0.330125, "grad_norm": 2.609375, "grad_norm_var": 0.008784993489583334, "learning_rate": 0.0001, "loss": 7.6121, "loss/crossentropy": 2.179291784763336, "loss/hidden": 2.9921875, "loss/jsd": 0.0, "loss/logits": 0.2208571508526802, "step": 5282 }, { "epoch": 0.33025, "grad_norm": 2.796875, "grad_norm_var": 0.012483723958333333, "learning_rate": 0.0001, "loss": 7.8844, "loss/crossentropy": 2.2645777463912964, "loss/hidden": 3.1015625, "loss/jsd": 0.0, "loss/logits": 0.23816916346549988, "step": 5284 }, { "epoch": 0.330375, "grad_norm": 2.71875, "grad_norm_var": 0.0134918212890625, "learning_rate": 0.0001, "loss": 7.659, "loss/crossentropy": 2.4011365175247192, "loss/hidden": 3.0234375, "loss/jsd": 0.0, "loss/logits": 0.2329876646399498, "step": 5286 }, { "epoch": 0.3305, "grad_norm": 2.796875, "grad_norm_var": 0.015458170572916667, "learning_rate": 0.0001, "loss": 7.8741, "loss/crossentropy": 2.2827980518341064, "loss/hidden": 2.9140625, "loss/jsd": 0.0, "loss/logits": 0.23250605165958405, "step": 5288 }, { "epoch": 0.330625, "grad_norm": 2.625, "grad_norm_var": 0.015104166666666667, "learning_rate": 0.0001, "loss": 7.6374, "loss/crossentropy": 2.2217756509780884, "loss/hidden": 3.046875, "loss/jsd": 0.0, "loss/logits": 0.23373153805732727, "step": 5290 }, { "epoch": 0.33075, "grad_norm": 2.6875, "grad_norm_var": 0.013004557291666666, "learning_rate": 0.0001, "loss": 7.8195, "loss/crossentropy": 2.191245675086975, "loss/hidden": 3.0546875, "loss/jsd": 0.0, "loss/logits": 0.24682573229074478, "step": 5292 }, { "epoch": 0.330875, "grad_norm": 2.765625, "grad_norm_var": 0.0130279541015625, "learning_rate": 0.0001, "loss": 7.7493, "loss/crossentropy": 2.2998467683792114, "loss/hidden": 3.09375, "loss/jsd": 0.0, "loss/logits": 0.2470022290945053, "step": 5294 }, { "epoch": 0.331, "grad_norm": 2.78125, "grad_norm_var": 0.01376953125, "learning_rate": 0.0001, "loss": 7.7662, "loss/crossentropy": 2.4754737615585327, "loss/hidden": 3.0, "loss/jsd": 0.0, "loss/logits": 0.232720784842968, "step": 5296 }, { "epoch": 0.331125, "grad_norm": 2.640625, "grad_norm_var": 0.015526326497395833, "learning_rate": 0.0001, "loss": 7.7613, "loss/crossentropy": 2.4555588960647583, "loss/hidden": 3.0390625, "loss/jsd": 0.0, "loss/logits": 0.23541343957185745, "step": 5298 }, { "epoch": 0.33125, "grad_norm": 2.59375, "grad_norm_var": 0.015966796875, "learning_rate": 0.0001, "loss": 7.6336, "loss/crossentropy": 2.478135347366333, "loss/hidden": 3.0234375, "loss/jsd": 0.0, "loss/logits": 0.24744707345962524, "step": 5300 }, { "epoch": 0.331375, "grad_norm": 2.84375, "grad_norm_var": 0.014794921875, "learning_rate": 0.0001, "loss": 7.7296, "loss/crossentropy": 2.2627123594284058, "loss/hidden": 3.0390625, "loss/jsd": 0.0, "loss/logits": 0.24574822187423706, "step": 5302 }, { "epoch": 0.3315, "grad_norm": 2.640625, "grad_norm_var": 0.013304646809895833, "learning_rate": 0.0001, "loss": 7.7155, "loss/crossentropy": 2.1669325828552246, "loss/hidden": 3.140625, "loss/jsd": 0.0, "loss/logits": 0.2320648953318596, "step": 5304 }, { "epoch": 0.331625, "grad_norm": 2.625, "grad_norm_var": 0.013597615559895833, "learning_rate": 0.0001, "loss": 7.8774, "loss/crossentropy": 2.386413097381592, "loss/hidden": 3.078125, "loss/jsd": 0.0, "loss/logits": 0.23917317390441895, "step": 5306 }, { "epoch": 0.33175, "grad_norm": 2.78125, "grad_norm_var": 0.013459269205729167, "learning_rate": 0.0001, "loss": 7.7237, "loss/crossentropy": 2.290984869003296, "loss/hidden": 3.0625, "loss/jsd": 0.0, "loss/logits": 0.2152412086725235, "step": 5308 }, { "epoch": 0.331875, "grad_norm": 2.578125, "grad_norm_var": 0.01881103515625, "learning_rate": 0.0001, "loss": 7.5178, "loss/crossentropy": 2.152104139328003, "loss/hidden": 3.0625, "loss/jsd": 0.0, "loss/logits": 0.2216940075159073, "step": 5310 }, { "epoch": 0.332, "grad_norm": 2.78125, "grad_norm_var": 0.01881103515625, "learning_rate": 0.0001, "loss": 7.7865, "loss/crossentropy": 2.0778995752334595, "loss/hidden": 3.125, "loss/jsd": 0.0, "loss/logits": 0.24253031611442566, "step": 5312 }, { "epoch": 0.332125, "grad_norm": 2.96875, "grad_norm_var": 0.019136555989583335, "learning_rate": 0.0001, "loss": 7.8274, "loss/crossentropy": 2.322032928466797, "loss/hidden": 3.0703125, "loss/jsd": 0.0, "loss/logits": 0.2524111419916153, "step": 5314 }, { "epoch": 0.33225, "grad_norm": 2.609375, "grad_norm_var": 0.01865234375, "learning_rate": 0.0001, "loss": 7.6657, "loss/crossentropy": 2.316957473754883, "loss/hidden": 3.0390625, "loss/jsd": 0.0, "loss/logits": 0.23442257940769196, "step": 5316 }, { "epoch": 0.332375, "grad_norm": 2.8125, "grad_norm_var": 0.019559733072916665, "learning_rate": 0.0001, "loss": 7.6677, "loss/crossentropy": 2.037115693092346, "loss/hidden": 3.0078125, "loss/jsd": 0.0, "loss/logits": 0.24141768366098404, "step": 5318 }, { "epoch": 0.3325, "grad_norm": 2.671875, "grad_norm_var": 0.019234212239583333, "learning_rate": 0.0001, "loss": 7.4972, "loss/crossentropy": 2.126655101776123, "loss/hidden": 2.9921875, "loss/jsd": 0.0, "loss/logits": 0.2226468026638031, "step": 5320 }, { "epoch": 0.332625, "grad_norm": 2.625, "grad_norm_var": 0.021761067708333335, "learning_rate": 0.0001, "loss": 7.943, "loss/crossentropy": 2.3077722787857056, "loss/hidden": 3.109375, "loss/jsd": 0.0, "loss/logits": 0.25074855983257294, "step": 5322 }, { "epoch": 0.33275, "grad_norm": 2.671875, "grad_norm_var": 0.021891276041666668, "learning_rate": 0.0001, "loss": 7.7584, "loss/crossentropy": 2.154437303543091, "loss/hidden": 3.078125, "loss/jsd": 0.0, "loss/logits": 0.22180060297250748, "step": 5324 }, { "epoch": 0.332875, "grad_norm": 2.9375, "grad_norm_var": 0.018635050455729166, "learning_rate": 0.0001, "loss": 7.8301, "loss/crossentropy": 2.260456681251526, "loss/hidden": 3.1640625, "loss/jsd": 0.0, "loss/logits": 0.3006417900323868, "step": 5326 }, { "epoch": 0.333, "grad_norm": 2.609375, "grad_norm_var": 0.0174713134765625, "learning_rate": 0.0001, "loss": 7.6532, "loss/crossentropy": 2.245373249053955, "loss/hidden": 3.09375, "loss/jsd": 0.0, "loss/logits": 0.24294578284025192, "step": 5328 }, { "epoch": 0.333125, "grad_norm": 2.515625, "grad_norm_var": 0.0191802978515625, "learning_rate": 0.0001, "loss": 7.6832, "loss/crossentropy": 2.270382881164551, "loss/hidden": 2.9375, "loss/jsd": 0.0, "loss/logits": 0.2192334160208702, "step": 5330 }, { "epoch": 0.33325, "grad_norm": 2.8125, "grad_norm_var": 0.0183258056640625, "learning_rate": 0.0001, "loss": 7.7119, "loss/crossentropy": 2.332479476928711, "loss/hidden": 3.09375, "loss/jsd": 0.0, "loss/logits": 0.24493084847927094, "step": 5332 }, { "epoch": 0.333375, "grad_norm": 2.78125, "grad_norm_var": 0.016486612955729167, "learning_rate": 0.0001, "loss": 7.8076, "loss/crossentropy": 2.1865415573120117, "loss/hidden": 3.0703125, "loss/jsd": 0.0, "loss/logits": 0.23635630309581757, "step": 5334 }, { "epoch": 0.3335, "grad_norm": 2.5625, "grad_norm_var": 0.019050089518229167, "learning_rate": 0.0001, "loss": 7.5084, "loss/crossentropy": 2.3751423358917236, "loss/hidden": 3.0078125, "loss/jsd": 0.0, "loss/logits": 0.22569116950035095, "step": 5336 }, { "epoch": 0.333625, "grad_norm": 2.6875, "grad_norm_var": 0.01539306640625, "learning_rate": 0.0001, "loss": 7.5883, "loss/crossentropy": 2.1458349227905273, "loss/hidden": 2.9140625, "loss/jsd": 0.0, "loss/logits": 0.22991656512022018, "step": 5338 }, { "epoch": 0.33375, "grad_norm": 2.640625, "grad_norm_var": 0.016185506184895834, "learning_rate": 0.0001, "loss": 7.6676, "loss/crossentropy": 2.3511977195739746, "loss/hidden": 3.1328125, "loss/jsd": 0.0, "loss/logits": 0.23896098881959915, "step": 5340 }, { "epoch": 0.333875, "grad_norm": 2.71875, "grad_norm_var": 0.0079254150390625, "learning_rate": 0.0001, "loss": 7.6857, "loss/crossentropy": 2.2015466690063477, "loss/hidden": 3.0625, "loss/jsd": 0.0, "loss/logits": 0.24131523072719574, "step": 5342 }, { "epoch": 0.334, "grad_norm": 2.671875, "grad_norm_var": 0.0067535400390625, "learning_rate": 0.0001, "loss": 7.781, "loss/crossentropy": 2.3441271781921387, "loss/hidden": 3.09375, "loss/jsd": 0.0, "loss/logits": 0.23066142201423645, "step": 5344 }, { "epoch": 0.334125, "grad_norm": 2.578125, "grad_norm_var": 0.00504150390625, "learning_rate": 0.0001, "loss": 7.5073, "loss/crossentropy": 2.2468265891075134, "loss/hidden": 3.1015625, "loss/jsd": 0.0, "loss/logits": 0.22720026969909668, "step": 5346 }, { "epoch": 0.33425, "grad_norm": 2.8125, "grad_norm_var": 0.0050445556640625, "learning_rate": 0.0001, "loss": 7.6751, "loss/crossentropy": 2.37017023563385, "loss/hidden": 3.0390625, "loss/jsd": 0.0, "loss/logits": 0.2387090027332306, "step": 5348 }, { "epoch": 0.334375, "grad_norm": 2.8125, "grad_norm_var": 0.00777587890625, "learning_rate": 0.0001, "loss": 7.7905, "loss/crossentropy": 2.2650283575057983, "loss/hidden": 2.984375, "loss/jsd": 0.0, "loss/logits": 0.2379176989197731, "step": 5350 }, { "epoch": 0.3345, "grad_norm": 2.65625, "grad_norm_var": 0.008739217122395834, "learning_rate": 0.0001, "loss": 7.7017, "loss/crossentropy": 2.2644563913345337, "loss/hidden": 3.0, "loss/jsd": 0.0, "loss/logits": 0.23334988206624985, "step": 5352 }, { "epoch": 0.334625, "grad_norm": 2.546875, "grad_norm_var": 0.012360636393229167, "learning_rate": 0.0001, "loss": 7.7702, "loss/crossentropy": 2.3412492275238037, "loss/hidden": 3.0625, "loss/jsd": 0.0, "loss/logits": 0.23740653693675995, "step": 5354 }, { "epoch": 0.33475, "grad_norm": 2.671875, "grad_norm_var": 0.011693318684895834, "learning_rate": 0.0001, "loss": 7.6463, "loss/crossentropy": 2.1389544010162354, "loss/hidden": 2.9453125, "loss/jsd": 0.0, "loss/logits": 0.21168312430381775, "step": 5356 }, { "epoch": 0.334875, "grad_norm": 2.9375, "grad_norm_var": 0.015250651041666667, "learning_rate": 0.0001, "loss": 7.6978, "loss/crossentropy": 2.3015042543411255, "loss/hidden": 3.015625, "loss/jsd": 0.0, "loss/logits": 0.24111038446426392, "step": 5358 }, { "epoch": 0.335, "grad_norm": 2.46875, "grad_norm_var": 0.020438639322916667, "learning_rate": 0.0001, "loss": 7.7264, "loss/crossentropy": 2.3176828622817993, "loss/hidden": 3.0234375, "loss/jsd": 0.0, "loss/logits": 0.2164960652589798, "step": 5360 }, { "epoch": 0.335125, "grad_norm": 2.609375, "grad_norm_var": 0.021187337239583333, "learning_rate": 0.0001, "loss": 7.6704, "loss/crossentropy": 2.3154109716415405, "loss/hidden": 3.1015625, "loss/jsd": 0.0, "loss/logits": 0.252921462059021, "step": 5362 }, { "epoch": 0.33525, "grad_norm": 2.6875, "grad_norm_var": 0.0204742431640625, "learning_rate": 0.0001, "loss": 7.7753, "loss/crossentropy": 2.374959349632263, "loss/hidden": 3.078125, "loss/jsd": 0.0, "loss/logits": 0.2546839714050293, "step": 5364 }, { "epoch": 0.335375, "grad_norm": 2.578125, "grad_norm_var": 0.0185455322265625, "learning_rate": 0.0001, "loss": 7.4059, "loss/crossentropy": 2.176779806613922, "loss/hidden": 3.09375, "loss/jsd": 0.0, "loss/logits": 0.23205051571130753, "step": 5366 }, { "epoch": 0.3355, "grad_norm": 2.578125, "grad_norm_var": 0.0183013916015625, "learning_rate": 0.0001, "loss": 7.7282, "loss/crossentropy": 2.58807635307312, "loss/hidden": 3.1640625, "loss/jsd": 0.0, "loss/logits": 0.2593914717435837, "step": 5368 }, { "epoch": 0.335625, "grad_norm": 2.5625, "grad_norm_var": 0.015331013997395834, "learning_rate": 0.0001, "loss": 7.6049, "loss/crossentropy": 2.158120632171631, "loss/hidden": 3.0234375, "loss/jsd": 0.0, "loss/logits": 0.2275710627436638, "step": 5370 }, { "epoch": 0.33575, "grad_norm": 2.65625, "grad_norm_var": 0.018505859375, "learning_rate": 0.0001, "loss": 7.8884, "loss/crossentropy": 2.5111550092697144, "loss/hidden": 3.0859375, "loss/jsd": 0.0, "loss/logits": 0.2529209405183792, "step": 5372 }, { "epoch": 0.335875, "grad_norm": 2.46875, "grad_norm_var": 0.015576171875, "learning_rate": 0.0001, "loss": 7.6436, "loss/crossentropy": 2.0710654258728027, "loss/hidden": 3.0234375, "loss/jsd": 0.0, "loss/logits": 0.24373741447925568, "step": 5374 }, { "epoch": 0.336, "grad_norm": 2.75, "grad_norm_var": 0.01285400390625, "learning_rate": 0.0001, "loss": 7.4313, "loss/crossentropy": 2.2395845651626587, "loss/hidden": 3.1015625, "loss/jsd": 0.0, "loss/logits": 0.23073510825634003, "step": 5376 }, { "epoch": 0.336125, "grad_norm": 2.703125, "grad_norm_var": 0.0240142822265625, "learning_rate": 0.0001, "loss": 7.6411, "loss/crossentropy": 2.0867984890937805, "loss/hidden": 3.0234375, "loss/jsd": 0.0, "loss/logits": 0.2168048992753029, "step": 5378 }, { "epoch": 0.33625, "grad_norm": 2.5625, "grad_norm_var": 0.034342447916666664, "learning_rate": 0.0001, "loss": 7.5342, "loss/crossentropy": 2.0329999923706055, "loss/hidden": 3.1171875, "loss/jsd": 0.0, "loss/logits": 0.22913432866334915, "step": 5380 }, { "epoch": 0.336375, "grad_norm": 2.671875, "grad_norm_var": 0.03173828125, "learning_rate": 0.0001, "loss": 7.7578, "loss/crossentropy": 2.2586809396743774, "loss/hidden": 3.0078125, "loss/jsd": 0.0, "loss/logits": 0.23330243676900864, "step": 5382 }, { "epoch": 0.3365, "grad_norm": 2.53125, "grad_norm_var": 0.032380167643229166, "learning_rate": 0.0001, "loss": 7.5998, "loss/crossentropy": 2.1368556022644043, "loss/hidden": 3.0, "loss/jsd": 0.0, "loss/logits": 0.20959025621414185, "step": 5384 }, { "epoch": 0.336625, "grad_norm": 2.578125, "grad_norm_var": 0.033869425455729164, "learning_rate": 0.0001, "loss": 7.5338, "loss/crossentropy": 2.5093750953674316, "loss/hidden": 3.015625, "loss/jsd": 0.0, "loss/logits": 0.227738156914711, "step": 5386 }, { "epoch": 0.33675, "grad_norm": 2.609375, "grad_norm_var": 0.03860270182291667, "learning_rate": 0.0001, "loss": 7.676, "loss/crossentropy": 2.181707739830017, "loss/hidden": 3.0390625, "loss/jsd": 0.0, "loss/logits": 0.2273741215467453, "step": 5388 }, { "epoch": 0.336875, "grad_norm": 2.875, "grad_norm_var": 0.03746744791666667, "learning_rate": 0.0001, "loss": 7.6515, "loss/crossentropy": 2.081053137779236, "loss/hidden": 2.9921875, "loss/jsd": 0.0, "loss/logits": 0.22646404802799225, "step": 5390 }, { "epoch": 0.337, "grad_norm": 3.65625, "grad_norm_var": 0.08640848795572917, "learning_rate": 0.0001, "loss": 7.8483, "loss/crossentropy": 2.242596983909607, "loss/hidden": 3.0078125, "loss/jsd": 0.0, "loss/logits": 0.24788595736026764, "step": 5392 }, { "epoch": 0.337125, "grad_norm": 2.78125, "grad_norm_var": 0.13919270833333333, "learning_rate": 0.0001, "loss": 8.0067, "loss/crossentropy": 2.3730074167251587, "loss/hidden": 2.9765625, "loss/jsd": 0.0, "loss/logits": 0.24407798051834106, "step": 5394 }, { "epoch": 0.33725, "grad_norm": 2.6875, "grad_norm_var": 0.1376373291015625, "learning_rate": 0.0001, "loss": 7.8557, "loss/crossentropy": 2.4030107259750366, "loss/hidden": 3.15625, "loss/jsd": 0.0, "loss/logits": 0.25184108316898346, "step": 5396 }, { "epoch": 0.337375, "grad_norm": 2.625, "grad_norm_var": 0.14401041666666667, "learning_rate": 0.0001, "loss": 7.6555, "loss/crossentropy": 2.195927083492279, "loss/hidden": 3.03125, "loss/jsd": 0.0, "loss/logits": 0.2218036949634552, "step": 5398 }, { "epoch": 0.3375, "grad_norm": 2.921875, "grad_norm_var": 0.13635660807291666, "learning_rate": 0.0001, "loss": 7.8714, "loss/crossentropy": 2.524413824081421, "loss/hidden": 3.078125, "loss/jsd": 0.0, "loss/logits": 0.2626534104347229, "step": 5400 }, { "epoch": 0.337625, "grad_norm": 2.796875, "grad_norm_var": 0.11949462890625, "learning_rate": 0.0001, "loss": 7.9501, "loss/crossentropy": 2.1232518553733826, "loss/hidden": 2.9921875, "loss/jsd": 0.0, "loss/logits": 0.2492925226688385, "step": 5402 }, { "epoch": 0.33775, "grad_norm": 2.859375, "grad_norm_var": 0.11213785807291667, "learning_rate": 0.0001, "loss": 7.7723, "loss/crossentropy": 2.2592722177505493, "loss/hidden": 3.125, "loss/jsd": 0.0, "loss/logits": 0.2542496919631958, "step": 5404 }, { "epoch": 0.337875, "grad_norm": 2.890625, "grad_norm_var": 0.1163238525390625, "learning_rate": 0.0001, "loss": 7.6869, "loss/crossentropy": 2.3548309803009033, "loss/hidden": 2.9609375, "loss/jsd": 0.0, "loss/logits": 0.23627442121505737, "step": 5406 }, { "epoch": 0.338, "grad_norm": 2.609375, "grad_norm_var": 0.08672587076822917, "learning_rate": 0.0001, "loss": 7.825, "loss/crossentropy": 2.344427466392517, "loss/hidden": 3.078125, "loss/jsd": 0.0, "loss/logits": 0.253245085477829, "step": 5408 }, { "epoch": 0.338125, "grad_norm": 2.65625, "grad_norm_var": 0.03950907389322917, "learning_rate": 0.0001, "loss": 7.7364, "loss/crossentropy": 2.400504946708679, "loss/hidden": 3.0234375, "loss/jsd": 0.0, "loss/logits": 0.23406721651554108, "step": 5410 }, { "epoch": 0.33825, "grad_norm": 2.578125, "grad_norm_var": 0.038590494791666666, "learning_rate": 0.0001, "loss": 7.6006, "loss/crossentropy": 2.2784605026245117, "loss/hidden": 2.9453125, "loss/jsd": 0.0, "loss/logits": 0.23298882693052292, "step": 5412 }, { "epoch": 0.338375, "grad_norm": 2.734375, "grad_norm_var": 0.03508707682291667, "learning_rate": 0.0001, "loss": 7.6951, "loss/crossentropy": 2.436177968978882, "loss/hidden": 3.0234375, "loss/jsd": 0.0, "loss/logits": 0.21969770640134811, "step": 5414 }, { "epoch": 0.3385, "grad_norm": 2.625, "grad_norm_var": 0.0228515625, "learning_rate": 0.0001, "loss": 7.6232, "loss/crossentropy": 2.1667908430099487, "loss/hidden": 3.0546875, "loss/jsd": 0.0, "loss/logits": 0.23300473392009735, "step": 5416 }, { "epoch": 0.338625, "grad_norm": 2.703125, "grad_norm_var": 0.0233062744140625, "learning_rate": 0.0001, "loss": 7.7133, "loss/crossentropy": 2.2497023344039917, "loss/hidden": 3.0625, "loss/jsd": 0.0, "loss/logits": 0.24329498410224915, "step": 5418 }, { "epoch": 0.33875, "grad_norm": 2.625, "grad_norm_var": 0.0139312744140625, "learning_rate": 0.0001, "loss": 7.7249, "loss/crossentropy": 2.3715399503707886, "loss/hidden": 3.0390625, "loss/jsd": 0.0, "loss/logits": 0.23880942165851593, "step": 5420 }, { "epoch": 0.338875, "grad_norm": 2.703125, "grad_norm_var": 0.01060791015625, "learning_rate": 0.0001, "loss": 7.5744, "loss/crossentropy": 2.3206390142440796, "loss/hidden": 3.078125, "loss/jsd": 0.0, "loss/logits": 0.24868489801883698, "step": 5422 }, { "epoch": 0.339, "grad_norm": 2.875, "grad_norm_var": 0.008983357747395834, "learning_rate": 0.0001, "loss": 7.7716, "loss/crossentropy": 2.1691564321517944, "loss/hidden": 3.03125, "loss/jsd": 0.0, "loss/logits": 0.24463371187448502, "step": 5424 }, { "epoch": 0.339125, "grad_norm": 2.71875, "grad_norm_var": 0.030882771809895834, "learning_rate": 0.0001, "loss": 7.6772, "loss/crossentropy": 2.4441452026367188, "loss/hidden": 3.0234375, "loss/jsd": 0.0, "loss/logits": 0.2452142834663391, "step": 5426 }, { "epoch": 0.33925, "grad_norm": 2.6875, "grad_norm_var": 0.033101399739583336, "learning_rate": 0.0001, "loss": 7.8662, "loss/crossentropy": 2.3408702611923218, "loss/hidden": 3.078125, "loss/jsd": 0.0, "loss/logits": 0.23938198387622833, "step": 5428 }, { "epoch": 0.339375, "grad_norm": 2.6875, "grad_norm_var": 0.03299153645833333, "learning_rate": 0.0001, "loss": 7.7342, "loss/crossentropy": 2.131060838699341, "loss/hidden": 3.03125, "loss/jsd": 0.0, "loss/logits": 0.23268932104110718, "step": 5430 }, { "epoch": 0.3395, "grad_norm": 2.625, "grad_norm_var": 0.03266499837239583, "learning_rate": 0.0001, "loss": 7.6117, "loss/crossentropy": 2.3467212915420532, "loss/hidden": 3.0703125, "loss/jsd": 0.0, "loss/logits": 0.24147817492485046, "step": 5432 }, { "epoch": 0.339625, "grad_norm": 2.75, "grad_norm_var": 0.031722005208333334, "learning_rate": 0.0001, "loss": 7.7241, "loss/crossentropy": 2.3200254440307617, "loss/hidden": 3.09375, "loss/jsd": 0.0, "loss/logits": 0.2283915877342224, "step": 5434 }, { "epoch": 0.33975, "grad_norm": 2.5, "grad_norm_var": 0.03491923014322917, "learning_rate": 0.0001, "loss": 7.4539, "loss/crossentropy": 2.1004346013069153, "loss/hidden": 2.9609375, "loss/jsd": 0.0, "loss/logits": 0.2240995168685913, "step": 5436 }, { "epoch": 0.339875, "grad_norm": 2.59375, "grad_norm_var": 0.0373199462890625, "learning_rate": 0.0001, "loss": 7.6071, "loss/crossentropy": 2.202645182609558, "loss/hidden": 2.9609375, "loss/jsd": 0.0, "loss/logits": 0.22623766958713531, "step": 5438 }, { "epoch": 0.34, "grad_norm": 2.671875, "grad_norm_var": 0.037939453125, "learning_rate": 0.0001, "loss": 7.5276, "loss/crossentropy": 2.21840763092041, "loss/hidden": 3.0546875, "loss/jsd": 0.0, "loss/logits": 0.24946682155132294, "step": 5440 }, { "epoch": 0.340125, "grad_norm": 2.625, "grad_norm_var": 0.01646728515625, "learning_rate": 0.0001, "loss": 7.8162, "loss/crossentropy": 2.4105695486068726, "loss/hidden": 3.1015625, "loss/jsd": 0.0, "loss/logits": 0.25464296340942383, "step": 5442 }, { "epoch": 0.34025, "grad_norm": 2.578125, "grad_norm_var": 0.006810506184895833, "learning_rate": 0.0001, "loss": 7.579, "loss/crossentropy": 2.0545575618743896, "loss/hidden": 2.9375, "loss/jsd": 0.0, "loss/logits": 0.23225222527980804, "step": 5444 }, { "epoch": 0.340375, "grad_norm": 2.546875, "grad_norm_var": 0.006346638997395833, "learning_rate": 0.0001, "loss": 7.5754, "loss/crossentropy": 1.854706585407257, "loss/hidden": 3.0390625, "loss/jsd": 0.0, "loss/logits": 0.20120730996131897, "step": 5446 }, { "epoch": 0.3405, "grad_norm": 2.609375, "grad_norm_var": 0.006574503580729167, "learning_rate": 0.0001, "loss": 7.5274, "loss/crossentropy": 2.054883599281311, "loss/hidden": 2.984375, "loss/jsd": 0.0, "loss/logits": 0.2169378474354744, "step": 5448 }, { "epoch": 0.340625, "grad_norm": 2.671875, "grad_norm_var": 0.006376139322916667, "learning_rate": 0.0001, "loss": 7.8869, "loss/crossentropy": 2.397868037223816, "loss/hidden": 3.0546875, "loss/jsd": 0.0, "loss/logits": 0.24723049998283386, "step": 5450 }, { "epoch": 0.34075, "grad_norm": 2.78125, "grad_norm_var": 0.0064361572265625, "learning_rate": 0.0001, "loss": 7.5371, "loss/crossentropy": 2.2067463397979736, "loss/hidden": 3.1171875, "loss/jsd": 0.0, "loss/logits": 0.2284959852695465, "step": 5452 }, { "epoch": 0.340875, "grad_norm": 2.609375, "grad_norm_var": 0.006989542643229167, "learning_rate": 0.0001, "loss": 7.7254, "loss/crossentropy": 2.4336724281311035, "loss/hidden": 3.0234375, "loss/jsd": 0.0, "loss/logits": 0.23901217430830002, "step": 5454 }, { "epoch": 0.341, "grad_norm": 2.96875, "grad_norm_var": 0.023029581705729166, "learning_rate": 0.0001, "loss": 8.0668, "loss/crossentropy": 2.345983624458313, "loss/hidden": 3.09375, "loss/jsd": 0.0, "loss/logits": 0.26578347384929657, "step": 5456 }, { "epoch": 0.341125, "grad_norm": 2.71875, "grad_norm_var": 0.022777303059895834, "learning_rate": 0.0001, "loss": 7.7507, "loss/crossentropy": 2.429108738899231, "loss/hidden": 3.0390625, "loss/jsd": 0.0, "loss/logits": 0.24781614542007446, "step": 5458 }, { "epoch": 0.34125, "grad_norm": 2.734375, "grad_norm_var": 0.021598307291666667, "learning_rate": 0.0001, "loss": 7.6805, "loss/crossentropy": 2.5044549703598022, "loss/hidden": 3.1328125, "loss/jsd": 0.0, "loss/logits": 0.2291325405240059, "step": 5460 }, { "epoch": 0.341375, "grad_norm": 2.859375, "grad_norm_var": 0.0203765869140625, "learning_rate": 0.0001, "loss": 7.9486, "loss/crossentropy": 2.1825063228607178, "loss/hidden": 3.0703125, "loss/jsd": 0.0, "loss/logits": 0.23290787637233734, "step": 5462 }, { "epoch": 0.3415, "grad_norm": 2.78125, "grad_norm_var": 0.016502888997395833, "learning_rate": 0.0001, "loss": 7.6885, "loss/crossentropy": 2.4658403396606445, "loss/hidden": 3.078125, "loss/jsd": 0.0, "loss/logits": 0.24999180436134338, "step": 5464 }, { "epoch": 0.341625, "grad_norm": 2.578125, "grad_norm_var": 0.0194244384765625, "learning_rate": 0.0001, "loss": 7.6018, "loss/crossentropy": 2.316327691078186, "loss/hidden": 3.078125, "loss/jsd": 0.0, "loss/logits": 0.23996131867170334, "step": 5466 }, { "epoch": 0.34175, "grad_norm": 2.65625, "grad_norm_var": 0.019010416666666665, "learning_rate": 0.0001, "loss": 7.7587, "loss/crossentropy": 2.3416613340377808, "loss/hidden": 3.046875, "loss/jsd": 0.0, "loss/logits": 0.23552227765321732, "step": 5468 }, { "epoch": 0.341875, "grad_norm": 2.609375, "grad_norm_var": 0.0199615478515625, "learning_rate": 0.0001, "loss": 7.4249, "loss/crossentropy": 2.285277843475342, "loss/hidden": 2.953125, "loss/jsd": 0.0, "loss/logits": 0.2214520201086998, "step": 5470 }, { "epoch": 0.342, "grad_norm": 2.78125, "grad_norm_var": 0.006371053059895834, "learning_rate": 0.0001, "loss": 7.6742, "loss/crossentropy": 2.165732979774475, "loss/hidden": 3.0859375, "loss/jsd": 0.0, "loss/logits": 0.2333008572459221, "step": 5472 }, { "epoch": 0.342125, "grad_norm": 2.5625, "grad_norm_var": 0.0084869384765625, "learning_rate": 0.0001, "loss": 7.5944, "loss/crossentropy": 2.3960988521575928, "loss/hidden": 2.9609375, "loss/jsd": 0.0, "loss/logits": 0.2329879254102707, "step": 5474 }, { "epoch": 0.34225, "grad_norm": 2.859375, "grad_norm_var": 0.011872355143229167, "learning_rate": 0.0001, "loss": 7.4578, "loss/crossentropy": 2.2969651222229004, "loss/hidden": 3.03125, "loss/jsd": 0.0, "loss/logits": 0.21917758882045746, "step": 5476 }, { "epoch": 0.342375, "grad_norm": 2.859375, "grad_norm_var": 0.013971964518229166, "learning_rate": 0.0001, "loss": 7.7337, "loss/crossentropy": 2.3231669664382935, "loss/hidden": 3.046875, "loss/jsd": 0.0, "loss/logits": 0.24239341914653778, "step": 5478 }, { "epoch": 0.3425, "grad_norm": 2.671875, "grad_norm_var": 0.013093058268229167, "learning_rate": 0.0001, "loss": 7.6997, "loss/crossentropy": 2.1854217052459717, "loss/hidden": 2.96875, "loss/jsd": 0.0, "loss/logits": 0.22657964378595352, "step": 5480 }, { "epoch": 0.342625, "grad_norm": 2.484375, "grad_norm_var": 0.014574178059895833, "learning_rate": 0.0001, "loss": 7.8045, "loss/crossentropy": 2.4267548322677612, "loss/hidden": 2.984375, "loss/jsd": 0.0, "loss/logits": 0.22942905873060226, "step": 5482 }, { "epoch": 0.34275, "grad_norm": 2.703125, "grad_norm_var": 0.021467081705729165, "learning_rate": 0.0001, "loss": 7.6234, "loss/crossentropy": 2.251534342765808, "loss/hidden": 2.9296875, "loss/jsd": 0.0, "loss/logits": 0.22010570764541626, "step": 5484 }, { "epoch": 0.342875, "grad_norm": 2.71875, "grad_norm_var": 0.020536295572916665, "learning_rate": 0.0001, "loss": 7.7483, "loss/crossentropy": 2.0712852478027344, "loss/hidden": 2.953125, "loss/jsd": 0.0, "loss/logits": 0.24656465649604797, "step": 5486 }, { "epoch": 0.343, "grad_norm": 2.90625, "grad_norm_var": 0.023726399739583334, "learning_rate": 0.0001, "loss": 7.7348, "loss/crossentropy": 2.2492517232894897, "loss/hidden": 2.9765625, "loss/jsd": 0.0, "loss/logits": 0.2225281000137329, "step": 5488 }, { "epoch": 0.343125, "grad_norm": 2.765625, "grad_norm_var": 0.018717447916666668, "learning_rate": 0.0001, "loss": 7.7209, "loss/crossentropy": 2.2347571849823, "loss/hidden": 3.0546875, "loss/jsd": 0.0, "loss/logits": 0.24701686203479767, "step": 5490 }, { "epoch": 0.34325, "grad_norm": 2.65625, "grad_norm_var": 0.015327962239583333, "learning_rate": 0.0001, "loss": 7.5465, "loss/crossentropy": 2.125634729862213, "loss/hidden": 3.1171875, "loss/jsd": 0.0, "loss/logits": 0.2420414388179779, "step": 5492 }, { "epoch": 0.343375, "grad_norm": 2.6875, "grad_norm_var": 0.014679972330729167, "learning_rate": 0.0001, "loss": 7.7669, "loss/crossentropy": 2.2953875064849854, "loss/hidden": 3.0234375, "loss/jsd": 0.0, "loss/logits": 0.2414630949497223, "step": 5494 }, { "epoch": 0.3435, "grad_norm": 2.734375, "grad_norm_var": 0.016087849934895832, "learning_rate": 0.0001, "loss": 7.8041, "loss/crossentropy": 2.192203640937805, "loss/hidden": 3.140625, "loss/jsd": 0.0, "loss/logits": 0.245847687125206, "step": 5496 }, { "epoch": 0.343625, "grad_norm": 2.859375, "grad_norm_var": 0.013346354166666666, "learning_rate": 0.0001, "loss": 7.7218, "loss/crossentropy": 2.2936019897460938, "loss/hidden": 3.09375, "loss/jsd": 0.0, "loss/logits": 0.2146032154560089, "step": 5498 }, { "epoch": 0.34375, "grad_norm": 2.859375, "grad_norm_var": 0.010205078125, "learning_rate": 0.0001, "loss": 7.847, "loss/crossentropy": 2.5002620220184326, "loss/hidden": 3.109375, "loss/jsd": 0.0, "loss/logits": 0.2375916838645935, "step": 5500 }, { "epoch": 0.343875, "grad_norm": 2.59375, "grad_norm_var": 0.0134429931640625, "learning_rate": 0.0001, "loss": 7.4712, "loss/crossentropy": 2.0628748536109924, "loss/hidden": 2.9453125, "loss/jsd": 0.0, "loss/logits": 0.22490765899419785, "step": 5502 }, { "epoch": 0.344, "grad_norm": 2.578125, "grad_norm_var": 0.013932291666666667, "learning_rate": 0.0001, "loss": 7.647, "loss/crossentropy": 2.311306357383728, "loss/hidden": 3.0703125, "loss/jsd": 0.0, "loss/logits": 0.24860431998968124, "step": 5504 }, { "epoch": 0.344125, "grad_norm": 2.546875, "grad_norm_var": 0.016337076822916668, "learning_rate": 0.0001, "loss": 7.7744, "loss/crossentropy": 2.1533212661743164, "loss/hidden": 3.046875, "loss/jsd": 0.0, "loss/logits": 0.23137922585010529, "step": 5506 }, { "epoch": 0.34425, "grad_norm": 2.921875, "grad_norm_var": 0.01842041015625, "learning_rate": 0.0001, "loss": 7.7863, "loss/crossentropy": 2.1796070337295532, "loss/hidden": 3.140625, "loss/jsd": 0.0, "loss/logits": 0.2618604004383087, "step": 5508 }, { "epoch": 0.344375, "grad_norm": 2.578125, "grad_norm_var": 0.018973795572916667, "learning_rate": 0.0001, "loss": 7.8429, "loss/crossentropy": 2.3477160930633545, "loss/hidden": 3.1015625, "loss/jsd": 0.0, "loss/logits": 0.23746654391288757, "step": 5510 }, { "epoch": 0.3445, "grad_norm": 2.765625, "grad_norm_var": 0.018159993489583335, "learning_rate": 0.0001, "loss": 7.3329, "loss/crossentropy": 1.898864507675171, "loss/hidden": 3.015625, "loss/jsd": 0.0, "loss/logits": 0.224097341299057, "step": 5512 }, { "epoch": 0.344625, "grad_norm": 2.640625, "grad_norm_var": 0.013720703125, "learning_rate": 0.0001, "loss": 7.8504, "loss/crossentropy": 2.4488145112991333, "loss/hidden": 3.0390625, "loss/jsd": 0.0, "loss/logits": 0.2435411736369133, "step": 5514 }, { "epoch": 0.34475, "grad_norm": 2.453125, "grad_norm_var": 0.0146484375, "learning_rate": 0.0001, "loss": 7.6152, "loss/crossentropy": 2.1635671854019165, "loss/hidden": 3.0, "loss/jsd": 0.0, "loss/logits": 0.2312227562069893, "step": 5516 }, { "epoch": 0.344875, "grad_norm": 2.84375, "grad_norm_var": 0.015999348958333333, "learning_rate": 0.0001, "loss": 7.7936, "loss/crossentropy": 2.3724353313446045, "loss/hidden": 3.1015625, "loss/jsd": 0.0, "loss/logits": 0.26612764596939087, "step": 5518 }, { "epoch": 0.345, "grad_norm": 3.0, "grad_norm_var": 0.0217926025390625, "learning_rate": 0.0001, "loss": 7.9204, "loss/crossentropy": 2.573415517807007, "loss/hidden": 3.03125, "loss/jsd": 0.0, "loss/logits": 0.2438500076532364, "step": 5520 }, { "epoch": 0.345125, "grad_norm": 2.859375, "grad_norm_var": 0.0204986572265625, "learning_rate": 0.0001, "loss": 7.5371, "loss/crossentropy": 2.171481966972351, "loss/hidden": 3.078125, "loss/jsd": 0.0, "loss/logits": 0.22145873308181763, "step": 5522 }, { "epoch": 0.34525, "grad_norm": 3.09375, "grad_norm_var": 0.035319010416666664, "learning_rate": 0.0001, "loss": 8.1001, "loss/crossentropy": 2.1745868921279907, "loss/hidden": 3.0078125, "loss/jsd": 0.0, "loss/logits": 0.2202254831790924, "step": 5524 }, { "epoch": 0.345375, "grad_norm": 2.546875, "grad_norm_var": 0.03619384765625, "learning_rate": 0.0001, "loss": 7.7162, "loss/crossentropy": 2.299471855163574, "loss/hidden": 3.0625, "loss/jsd": 0.0, "loss/logits": 0.23935645818710327, "step": 5526 }, { "epoch": 0.3455, "grad_norm": 2.59375, "grad_norm_var": 0.03780924479166667, "learning_rate": 0.0001, "loss": 7.7276, "loss/crossentropy": 2.302342176437378, "loss/hidden": 3.03125, "loss/jsd": 0.0, "loss/logits": 0.22520852833986282, "step": 5528 }, { "epoch": 0.345625, "grad_norm": 2.609375, "grad_norm_var": 0.03815104166666667, "learning_rate": 0.0001, "loss": 7.8024, "loss/crossentropy": 2.2377175092697144, "loss/hidden": 3.09375, "loss/jsd": 0.0, "loss/logits": 0.24126768857240677, "step": 5530 }, { "epoch": 0.34575, "grad_norm": 2.65625, "grad_norm_var": 0.03299153645833333, "learning_rate": 0.0001, "loss": 7.5584, "loss/crossentropy": 2.0724852085113525, "loss/hidden": 2.984375, "loss/jsd": 0.0, "loss/logits": 0.22439610213041306, "step": 5532 }, { "epoch": 0.345875, "grad_norm": 2.59375, "grad_norm_var": 0.039427693684895834, "learning_rate": 0.0001, "loss": 7.4919, "loss/crossentropy": 2.0932891368865967, "loss/hidden": 2.9296875, "loss/jsd": 0.0, "loss/logits": 0.21347277611494064, "step": 5534 }, { "epoch": 0.346, "grad_norm": 2.4375, "grad_norm_var": 0.03996988932291667, "learning_rate": 0.0001, "loss": 7.6879, "loss/crossentropy": 2.3268193006515503, "loss/hidden": 2.9453125, "loss/jsd": 0.0, "loss/logits": 0.2188659980893135, "step": 5536 }, { "epoch": 0.346125, "grad_norm": 2.734375, "grad_norm_var": 0.04039713541666667, "learning_rate": 0.0001, "loss": 7.5868, "loss/crossentropy": 2.4453797340393066, "loss/hidden": 3.03125, "loss/jsd": 0.0, "loss/logits": 0.23379649221897125, "step": 5538 }, { "epoch": 0.34625, "grad_norm": 2.671875, "grad_norm_var": 0.009528605143229167, "learning_rate": 0.0001, "loss": 7.8, "loss/crossentropy": 2.6451261043548584, "loss/hidden": 3.09375, "loss/jsd": 0.0, "loss/logits": 0.23735536634922028, "step": 5540 }, { "epoch": 0.346375, "grad_norm": 2.578125, "grad_norm_var": 0.00787353515625, "learning_rate": 0.0001, "loss": 7.6444, "loss/crossentropy": 2.2719236612319946, "loss/hidden": 2.9921875, "loss/jsd": 0.0, "loss/logits": 0.23783369362354279, "step": 5542 }, { "epoch": 0.3465, "grad_norm": 2.734375, "grad_norm_var": 0.010374959309895833, "learning_rate": 0.0001, "loss": 7.722, "loss/crossentropy": 2.3831781148910522, "loss/hidden": 3.046875, "loss/jsd": 0.0, "loss/logits": 0.24425473809242249, "step": 5544 }, { "epoch": 0.346625, "grad_norm": 2.84375, "grad_norm_var": 0.015608723958333333, "learning_rate": 0.0001, "loss": 7.58, "loss/crossentropy": 1.9597296714782715, "loss/hidden": 2.96875, "loss/jsd": 0.0, "loss/logits": 0.23017365485429764, "step": 5546 }, { "epoch": 0.34675, "grad_norm": 2.78125, "grad_norm_var": 0.018583170572916665, "learning_rate": 0.0001, "loss": 7.8236, "loss/crossentropy": 2.307496428489685, "loss/hidden": 3.0234375, "loss/jsd": 0.0, "loss/logits": 0.2376098856329918, "step": 5548 }, { "epoch": 0.346875, "grad_norm": 2.625, "grad_norm_var": 0.016535441080729168, "learning_rate": 0.0001, "loss": 7.6374, "loss/crossentropy": 2.3370879888534546, "loss/hidden": 2.9921875, "loss/jsd": 0.0, "loss/logits": 0.22883698344230652, "step": 5550 }, { "epoch": 0.347, "grad_norm": 2.71875, "grad_norm_var": 0.013532511393229167, "learning_rate": 0.0001, "loss": 7.4872, "loss/crossentropy": 2.008628785610199, "loss/hidden": 3.109375, "loss/jsd": 0.0, "loss/logits": 0.21864137053489685, "step": 5552 }, { "epoch": 0.347125, "grad_norm": 2.703125, "grad_norm_var": 0.011017862955729167, "learning_rate": 0.0001, "loss": 7.8147, "loss/crossentropy": 2.302531361579895, "loss/hidden": 3.0, "loss/jsd": 0.0, "loss/logits": 0.2429972067475319, "step": 5554 }, { "epoch": 0.34725, "grad_norm": 3.3125, "grad_norm_var": 0.0364166259765625, "learning_rate": 0.0001, "loss": 7.79, "loss/crossentropy": 2.421267032623291, "loss/hidden": 3.0625, "loss/jsd": 0.0, "loss/logits": 0.24757269024848938, "step": 5556 }, { "epoch": 0.347375, "grad_norm": 2.828125, "grad_norm_var": 0.03616129557291667, "learning_rate": 0.0001, "loss": 7.7508, "loss/crossentropy": 2.438343048095703, "loss/hidden": 3.1015625, "loss/jsd": 0.0, "loss/logits": 0.245561845600605, "step": 5558 }, { "epoch": 0.3475, "grad_norm": 2.78125, "grad_norm_var": 0.0375885009765625, "learning_rate": 0.0001, "loss": 7.4635, "loss/crossentropy": 2.2139830589294434, "loss/hidden": 3.09375, "loss/jsd": 0.0, "loss/logits": 0.2317277044057846, "step": 5560 }, { "epoch": 0.347625, "grad_norm": 2.59375, "grad_norm_var": 0.03261617024739583, "learning_rate": 0.0001, "loss": 7.6594, "loss/crossentropy": 2.455079197883606, "loss/hidden": 3.0625, "loss/jsd": 0.0, "loss/logits": 0.23698723316192627, "step": 5562 }, { "epoch": 0.34775, "grad_norm": 2.75, "grad_norm_var": 0.032938639322916664, "learning_rate": 0.0001, "loss": 7.6575, "loss/crossentropy": 2.158273220062256, "loss/hidden": 3.015625, "loss/jsd": 0.0, "loss/logits": 0.22858229279518127, "step": 5564 }, { "epoch": 0.347875, "grad_norm": 2.53125, "grad_norm_var": 0.03599853515625, "learning_rate": 0.0001, "loss": 7.7246, "loss/crossentropy": 2.193575143814087, "loss/hidden": 2.9921875, "loss/jsd": 0.0, "loss/logits": 0.2248259112238884, "step": 5566 }, { "epoch": 0.348, "grad_norm": 2.65625, "grad_norm_var": 0.033837890625, "learning_rate": 0.0001, "loss": 7.6693, "loss/crossentropy": 2.2432382106781006, "loss/hidden": 3.1015625, "loss/jsd": 0.0, "loss/logits": 0.23841316998004913, "step": 5568 }, { "epoch": 0.348125, "grad_norm": 2.9375, "grad_norm_var": 0.036031087239583336, "learning_rate": 0.0001, "loss": 7.791, "loss/crossentropy": 2.0851770639419556, "loss/hidden": 2.9921875, "loss/jsd": 0.0, "loss/logits": 0.24872879683971405, "step": 5570 }, { "epoch": 0.34825, "grad_norm": 2.734375, "grad_norm_var": 0.012580362955729167, "learning_rate": 0.0001, "loss": 7.6344, "loss/crossentropy": 2.4083759784698486, "loss/hidden": 3.078125, "loss/jsd": 0.0, "loss/logits": 0.2341550886631012, "step": 5572 }, { "epoch": 0.348375, "grad_norm": 2.5, "grad_norm_var": 0.015086873372395834, "learning_rate": 0.0001, "loss": 7.8279, "loss/crossentropy": 2.1798534393310547, "loss/hidden": 3.0625, "loss/jsd": 0.0, "loss/logits": 0.22539126873016357, "step": 5574 }, { "epoch": 0.3485, "grad_norm": 2.640625, "grad_norm_var": 0.019563802083333335, "learning_rate": 0.0001, "loss": 7.415, "loss/crossentropy": 2.1162737607955933, "loss/hidden": 2.9765625, "loss/jsd": 0.0, "loss/logits": 0.2339876890182495, "step": 5576 }, { "epoch": 0.348625, "grad_norm": 2.59375, "grad_norm_var": 0.019755045572916668, "learning_rate": 0.0001, "loss": 7.6449, "loss/crossentropy": 2.3344627618789673, "loss/hidden": 2.953125, "loss/jsd": 0.0, "loss/logits": 0.23674559593200684, "step": 5578 }, { "epoch": 0.34875, "grad_norm": 2.671875, "grad_norm_var": 0.019950358072916667, "learning_rate": 0.0001, "loss": 7.6051, "loss/crossentropy": 2.258152961730957, "loss/hidden": 3.0078125, "loss/jsd": 0.0, "loss/logits": 0.23867005854845047, "step": 5580 }, { "epoch": 0.348875, "grad_norm": 2.90625, "grad_norm_var": 0.0203521728515625, "learning_rate": 0.0001, "loss": 7.7993, "loss/crossentropy": 2.2969924211502075, "loss/hidden": 3.09375, "loss/jsd": 0.0, "loss/logits": 0.24948721379041672, "step": 5582 }, { "epoch": 0.349, "grad_norm": 2.640625, "grad_norm_var": 0.0210845947265625, "learning_rate": 0.0001, "loss": 7.699, "loss/crossentropy": 2.2272658348083496, "loss/hidden": 2.9375, "loss/jsd": 0.0, "loss/logits": 0.22179457545280457, "step": 5584 }, { "epoch": 0.349125, "grad_norm": 2.578125, "grad_norm_var": 0.018440755208333333, "learning_rate": 0.0001, "loss": 7.6915, "loss/crossentropy": 2.2597315311431885, "loss/hidden": 2.9921875, "loss/jsd": 0.0, "loss/logits": 0.23527375608682632, "step": 5586 }, { "epoch": 0.34925, "grad_norm": 2.609375, "grad_norm_var": 0.0184234619140625, "learning_rate": 0.0001, "loss": 7.7685, "loss/crossentropy": 2.2496015429496765, "loss/hidden": 3.0390625, "loss/jsd": 0.0, "loss/logits": 0.22667241841554642, "step": 5588 }, { "epoch": 0.349375, "grad_norm": 2.515625, "grad_norm_var": 0.0169586181640625, "learning_rate": 0.0001, "loss": 7.8091, "loss/crossentropy": 2.3241220712661743, "loss/hidden": 2.953125, "loss/jsd": 0.0, "loss/logits": 0.22978024184703827, "step": 5590 }, { "epoch": 0.3495, "grad_norm": 2.59375, "grad_norm_var": 0.012498982747395833, "learning_rate": 0.0001, "loss": 7.7572, "loss/crossentropy": 2.506639838218689, "loss/hidden": 3.1015625, "loss/jsd": 0.0, "loss/logits": 0.24642347544431686, "step": 5592 }, { "epoch": 0.349625, "grad_norm": 2.640625, "grad_norm_var": 0.011888631184895833, "learning_rate": 0.0001, "loss": 7.6061, "loss/crossentropy": 2.30548095703125, "loss/hidden": 3.0546875, "loss/jsd": 0.0, "loss/logits": 0.23315352946519852, "step": 5594 }, { "epoch": 0.34975, "grad_norm": 3.390625, "grad_norm_var": 24.138179524739584, "learning_rate": 0.0001, "loss": 7.7912, "loss/crossentropy": 2.539211392402649, "loss/hidden": 3.078125, "loss/jsd": 0.0, "loss/logits": 0.24779551476240158, "step": 5596 }, { "epoch": 0.349875, "grad_norm": 2.890625, "grad_norm_var": 24.129605102539063, "learning_rate": 0.0001, "loss": 7.6732, "loss/crossentropy": 2.082237720489502, "loss/hidden": 3.078125, "loss/jsd": 0.0, "loss/logits": 0.24204111099243164, "step": 5598 }, { "epoch": 0.35, "grad_norm": 2.78125, "grad_norm_var": 24.13726806640625, "learning_rate": 0.0001, "loss": 7.757, "loss/crossentropy": 2.3941370248794556, "loss/hidden": 3.0234375, "loss/jsd": 0.0, "loss/logits": 0.23599355667829514, "step": 5600 }, { "epoch": 0.350125, "grad_norm": 2.671875, "grad_norm_var": 24.11782938639323, "learning_rate": 0.0001, "loss": 7.8375, "loss/crossentropy": 2.4407711029052734, "loss/hidden": 3.1015625, "loss/jsd": 0.0, "loss/logits": 0.2626349776983261, "step": 5602 }, { "epoch": 0.35025, "grad_norm": 2.90625, "grad_norm_var": 24.057103474934895, "learning_rate": 0.0001, "loss": 8.0175, "loss/crossentropy": 2.4861412048339844, "loss/hidden": 3.1015625, "loss/jsd": 0.0, "loss/logits": 0.2463313788175583, "step": 5604 }, { "epoch": 0.350375, "grad_norm": 2.5625, "grad_norm_var": 24.068229166666665, "learning_rate": 0.0001, "loss": 7.7747, "loss/crossentropy": 2.4824033975601196, "loss/hidden": 3.1015625, "loss/jsd": 0.0, "loss/logits": 0.2544633448123932, "step": 5606 }, { "epoch": 0.3505, "grad_norm": 2.875, "grad_norm_var": 23.994066365559895, "learning_rate": 0.0001, "loss": 7.65, "loss/crossentropy": 2.0816505551338196, "loss/hidden": 3.1328125, "loss/jsd": 0.0, "loss/logits": 0.2556811571121216, "step": 5608 }, { "epoch": 0.350625, "grad_norm": 2.59375, "grad_norm_var": 23.978831990559897, "learning_rate": 0.0001, "loss": 7.6959, "loss/crossentropy": 2.11083722114563, "loss/hidden": 2.984375, "loss/jsd": 0.0, "loss/logits": 0.22858330607414246, "step": 5610 }, { "epoch": 0.35075, "grad_norm": 2.921875, "grad_norm_var": 0.015990193684895834, "learning_rate": 0.0001, "loss": 7.4229, "loss/crossentropy": 2.0959490537643433, "loss/hidden": 3.0, "loss/jsd": 0.0, "loss/logits": 0.21905995905399323, "step": 5612 }, { "epoch": 0.350875, "grad_norm": 2.65625, "grad_norm_var": 0.016044108072916667, "learning_rate": 0.0001, "loss": 7.5769, "loss/crossentropy": 2.231637716293335, "loss/hidden": 3.0625, "loss/jsd": 0.0, "loss/logits": 0.21779616922140121, "step": 5614 }, { "epoch": 0.351, "grad_norm": 2.546875, "grad_norm_var": 0.018317667643229167, "learning_rate": 0.0001, "loss": 7.6451, "loss/crossentropy": 2.1899375915527344, "loss/hidden": 3.078125, "loss/jsd": 0.0, "loss/logits": 0.24350346624851227, "step": 5616 }, { "epoch": 0.351125, "grad_norm": 2.703125, "grad_norm_var": 0.019612630208333332, "learning_rate": 0.0001, "loss": 7.7244, "loss/crossentropy": 2.357677698135376, "loss/hidden": 3.140625, "loss/jsd": 0.0, "loss/logits": 0.21006733179092407, "step": 5618 }, { "epoch": 0.35125, "grad_norm": 2.453125, "grad_norm_var": 0.0213531494140625, "learning_rate": 0.0001, "loss": 7.5829, "loss/crossentropy": 2.164807915687561, "loss/hidden": 2.921875, "loss/jsd": 0.0, "loss/logits": 0.23102863132953644, "step": 5620 }, { "epoch": 0.351375, "grad_norm": 2.59375, "grad_norm_var": 0.020921834309895835, "learning_rate": 0.0001, "loss": 7.6785, "loss/crossentropy": 2.3327943086624146, "loss/hidden": 3.1015625, "loss/jsd": 0.0, "loss/logits": 0.24539093673229218, "step": 5622 }, { "epoch": 0.3515, "grad_norm": 2.703125, "grad_norm_var": 0.0143951416015625, "learning_rate": 0.0001, "loss": 7.9864, "loss/crossentropy": 2.5400614738464355, "loss/hidden": 3.09375, "loss/jsd": 0.0, "loss/logits": 0.2547309398651123, "step": 5624 }, { "epoch": 0.351625, "grad_norm": 2.65625, "grad_norm_var": 0.013798014322916666, "learning_rate": 0.0001, "loss": 7.881, "loss/crossentropy": 2.5600987672805786, "loss/hidden": 3.1953125, "loss/jsd": 0.0, "loss/logits": 0.25740641355514526, "step": 5626 }, { "epoch": 0.35175, "grad_norm": 2.5625, "grad_norm_var": 0.009789021809895833, "learning_rate": 0.0001, "loss": 7.7545, "loss/crossentropy": 2.574188470840454, "loss/hidden": 3.078125, "loss/jsd": 0.0, "loss/logits": 0.2433779612183571, "step": 5628 }, { "epoch": 0.351875, "grad_norm": 2.796875, "grad_norm_var": 0.011750284830729167, "learning_rate": 0.0001, "loss": 7.8818, "loss/crossentropy": 2.468465209007263, "loss/hidden": 3.1015625, "loss/jsd": 0.0, "loss/logits": 0.237942174077034, "step": 5630 }, { "epoch": 0.352, "grad_norm": 2.6875, "grad_norm_var": 0.0111480712890625, "learning_rate": 0.0001, "loss": 7.7991, "loss/crossentropy": 2.1748950481414795, "loss/hidden": 3.0234375, "loss/jsd": 0.0, "loss/logits": 0.22886700928211212, "step": 5632 }, { "epoch": 0.352125, "grad_norm": 2.65625, "grad_norm_var": 0.01070556640625, "learning_rate": 0.0001, "loss": 7.4315, "loss/crossentropy": 1.950684905052185, "loss/hidden": 2.9296875, "loss/jsd": 0.0, "loss/logits": 0.19095566868782043, "step": 5634 }, { "epoch": 0.35225, "grad_norm": 2.671875, "grad_norm_var": 0.005517578125, "learning_rate": 0.0001, "loss": 7.9361, "loss/crossentropy": 2.40484356880188, "loss/hidden": 3.109375, "loss/jsd": 0.0, "loss/logits": 0.23828066140413284, "step": 5636 }, { "epoch": 0.352375, "grad_norm": 2.625, "grad_norm_var": 0.005256144205729166, "learning_rate": 0.0001, "loss": 7.4915, "loss/crossentropy": 2.1357630491256714, "loss/hidden": 2.9453125, "loss/jsd": 0.0, "loss/logits": 0.22223906219005585, "step": 5638 }, { "epoch": 0.3525, "grad_norm": 2.671875, "grad_norm_var": 0.004865519205729167, "learning_rate": 0.0001, "loss": 7.6243, "loss/crossentropy": 2.375010132789612, "loss/hidden": 3.0703125, "loss/jsd": 0.0, "loss/logits": 0.2419091984629631, "step": 5640 }, { "epoch": 0.352625, "grad_norm": 2.734375, "grad_norm_var": 0.004857381184895833, "learning_rate": 0.0001, "loss": 7.835, "loss/crossentropy": 2.385709047317505, "loss/hidden": 3.078125, "loss/jsd": 0.0, "loss/logits": 0.21495652198791504, "step": 5642 }, { "epoch": 0.35275, "grad_norm": 2.78125, "grad_norm_var": 0.0035319010416666667, "learning_rate": 0.0001, "loss": 7.6076, "loss/crossentropy": 2.6881879568099976, "loss/hidden": 3.1015625, "loss/jsd": 0.0, "loss/logits": 0.2391050010919571, "step": 5644 }, { "epoch": 0.352875, "grad_norm": 2.609375, "grad_norm_var": 0.004417928059895834, "learning_rate": 0.0001, "loss": 7.3616, "loss/crossentropy": 2.5481897592544556, "loss/hidden": 2.953125, "loss/jsd": 0.0, "loss/logits": 0.2227291315793991, "step": 5646 }, { "epoch": 0.353, "grad_norm": 2.71875, "grad_norm_var": 0.0048736572265625, "learning_rate": 0.0001, "loss": 7.6351, "loss/crossentropy": 1.8755032420158386, "loss/hidden": 3.0078125, "loss/jsd": 0.0, "loss/logits": 0.23144642263650894, "step": 5648 }, { "epoch": 0.353125, "grad_norm": 2.875, "grad_norm_var": 0.008454386393229167, "learning_rate": 0.0001, "loss": 7.6697, "loss/crossentropy": 2.242702841758728, "loss/hidden": 3.1171875, "loss/jsd": 0.0, "loss/logits": 0.24922174960374832, "step": 5650 }, { "epoch": 0.35325, "grad_norm": 2.5625, "grad_norm_var": 0.013329060872395833, "learning_rate": 0.0001, "loss": 7.899, "loss/crossentropy": 2.3576923608779907, "loss/hidden": 3.015625, "loss/jsd": 0.0, "loss/logits": 0.23955199122428894, "step": 5652 }, { "epoch": 0.353375, "grad_norm": 2.640625, "grad_norm_var": 0.013313802083333333, "learning_rate": 0.0001, "loss": 7.8294, "loss/crossentropy": 2.255527913570404, "loss/hidden": 3.0078125, "loss/jsd": 0.0, "loss/logits": 0.2253405824303627, "step": 5654 }, { "epoch": 0.3535, "grad_norm": 2.546875, "grad_norm_var": 0.0143463134765625, "learning_rate": 0.0001, "loss": 7.5344, "loss/crossentropy": 1.9262883067131042, "loss/hidden": 2.9609375, "loss/jsd": 0.0, "loss/logits": 0.20440469682216644, "step": 5656 }, { "epoch": 0.353625, "grad_norm": 2.71875, "grad_norm_var": 0.016087849934895832, "learning_rate": 0.0001, "loss": 7.66, "loss/crossentropy": 2.170652449131012, "loss/hidden": 2.9765625, "loss/jsd": 0.0, "loss/logits": 0.22713115811347961, "step": 5658 }, { "epoch": 0.35375, "grad_norm": 2.671875, "grad_norm_var": 0.019465128580729168, "learning_rate": 0.0001, "loss": 7.8865, "loss/crossentropy": 2.2798702716827393, "loss/hidden": 3.0234375, "loss/jsd": 0.0, "loss/logits": 0.25286709517240524, "step": 5660 }, { "epoch": 0.353875, "grad_norm": 2.484375, "grad_norm_var": 0.025226847330729166, "learning_rate": 0.0001, "loss": 7.4547, "loss/crossentropy": 2.248200535774231, "loss/hidden": 3.015625, "loss/jsd": 0.0, "loss/logits": 0.22316128015518188, "step": 5662 }, { "epoch": 0.354, "grad_norm": 2.625, "grad_norm_var": 0.026981608072916666, "learning_rate": 0.0001, "loss": 7.7952, "loss/crossentropy": 2.4335741996765137, "loss/hidden": 3.125, "loss/jsd": 0.0, "loss/logits": 0.2533376216888428, "step": 5664 }, { "epoch": 0.354125, "grad_norm": 2.78125, "grad_norm_var": 0.024332682291666668, "learning_rate": 0.0001, "loss": 7.9831, "loss/crossentropy": 2.365623950958252, "loss/hidden": 3.0234375, "loss/jsd": 0.0, "loss/logits": 0.23785682022571564, "step": 5666 }, { "epoch": 0.35425, "grad_norm": 2.609375, "grad_norm_var": 0.0218658447265625, "learning_rate": 0.0001, "loss": 7.6136, "loss/crossentropy": 2.2862355709075928, "loss/hidden": 3.015625, "loss/jsd": 0.0, "loss/logits": 0.23654907941818237, "step": 5668 }, { "epoch": 0.354375, "grad_norm": 2.75, "grad_norm_var": 0.021610514322916666, "learning_rate": 0.0001, "loss": 7.7669, "loss/crossentropy": 2.388039708137512, "loss/hidden": 3.0390625, "loss/jsd": 0.0, "loss/logits": 0.23072518408298492, "step": 5670 }, { "epoch": 0.3545, "grad_norm": 2.9375, "grad_norm_var": 0.0245269775390625, "learning_rate": 0.0001, "loss": 7.6787, "loss/crossentropy": 2.1460591554641724, "loss/hidden": 3.0390625, "loss/jsd": 0.0, "loss/logits": 0.24326714873313904, "step": 5672 }, { "epoch": 0.354625, "grad_norm": 2.59375, "grad_norm_var": 0.02340087890625, "learning_rate": 0.0001, "loss": 7.5565, "loss/crossentropy": 2.267369866371155, "loss/hidden": 3.03125, "loss/jsd": 0.0, "loss/logits": 0.2414206862449646, "step": 5674 }, { "epoch": 0.35475, "grad_norm": 2.9375, "grad_norm_var": 0.02451171875, "learning_rate": 0.0001, "loss": 7.6004, "loss/crossentropy": 2.316736102104187, "loss/hidden": 3.0390625, "loss/jsd": 0.0, "loss/logits": 0.2455018386244774, "step": 5676 }, { "epoch": 0.354875, "grad_norm": 2.515625, "grad_norm_var": 0.01783447265625, "learning_rate": 0.0001, "loss": 7.541, "loss/crossentropy": 2.1064939498901367, "loss/hidden": 2.9140625, "loss/jsd": 0.0, "loss/logits": 0.21690509468317032, "step": 5678 }, { "epoch": 0.355, "grad_norm": 2.65625, "grad_norm_var": 0.016206868489583335, "learning_rate": 0.0001, "loss": 7.693, "loss/crossentropy": 2.557629704475403, "loss/hidden": 3.125, "loss/jsd": 0.0, "loss/logits": 0.2511487454175949, "step": 5680 }, { "epoch": 0.355125, "grad_norm": 2.796875, "grad_norm_var": 0.016234334309895834, "learning_rate": 0.0001, "loss": 7.7064, "loss/crossentropy": 2.3363611698150635, "loss/hidden": 3.0078125, "loss/jsd": 0.0, "loss/logits": 0.23954744637012482, "step": 5682 }, { "epoch": 0.35525, "grad_norm": 2.84375, "grad_norm_var": 0.015445963541666666, "learning_rate": 0.0001, "loss": 7.8347, "loss/crossentropy": 2.3296741247177124, "loss/hidden": 3.1171875, "loss/jsd": 0.0, "loss/logits": 0.2454221546649933, "step": 5684 }, { "epoch": 0.355375, "grad_norm": 2.796875, "grad_norm_var": 0.0158111572265625, "learning_rate": 0.0001, "loss": 7.7695, "loss/crossentropy": 2.2176859378814697, "loss/hidden": 3.046875, "loss/jsd": 0.0, "loss/logits": 0.25949132442474365, "step": 5686 }, { "epoch": 0.3555, "grad_norm": 2.59375, "grad_norm_var": 0.014371744791666667, "learning_rate": 0.0001, "loss": 7.6318, "loss/crossentropy": 2.0935255885124207, "loss/hidden": 3.0390625, "loss/jsd": 0.0, "loss/logits": 0.2221689522266388, "step": 5688 }, { "epoch": 0.355625, "grad_norm": 2.703125, "grad_norm_var": 0.014549763997395833, "learning_rate": 0.0001, "loss": 7.8214, "loss/crossentropy": 2.1757096648216248, "loss/hidden": 3.0859375, "loss/jsd": 0.0, "loss/logits": 0.2440442219376564, "step": 5690 }, { "epoch": 0.35575, "grad_norm": 2.796875, "grad_norm_var": 0.0114654541015625, "learning_rate": 0.0001, "loss": 7.7502, "loss/crossentropy": 2.514808773994446, "loss/hidden": 3.0390625, "loss/jsd": 0.0, "loss/logits": 0.24424804002046585, "step": 5692 }, { "epoch": 0.355875, "grad_norm": 2.765625, "grad_norm_var": 0.013622029622395834, "learning_rate": 0.0001, "loss": 7.5661, "loss/crossentropy": 2.3097294569015503, "loss/hidden": 2.953125, "loss/jsd": 0.0, "loss/logits": 0.2392885386943817, "step": 5694 }, { "epoch": 0.356, "grad_norm": 2.71875, "grad_norm_var": 0.0136383056640625, "learning_rate": 0.0001, "loss": 7.7194, "loss/crossentropy": 2.285163164138794, "loss/hidden": 2.921875, "loss/jsd": 0.0, "loss/logits": 0.22372599691152573, "step": 5696 }, { "epoch": 0.356125, "grad_norm": 2.703125, "grad_norm_var": 0.017513020833333334, "learning_rate": 0.0001, "loss": 7.3876, "loss/crossentropy": 1.9280012249946594, "loss/hidden": 3.0546875, "loss/jsd": 0.0, "loss/logits": 0.2646433413028717, "step": 5698 }, { "epoch": 0.35625, "grad_norm": 2.765625, "grad_norm_var": 0.016266886393229166, "learning_rate": 0.0001, "loss": 7.7874, "loss/crossentropy": 2.2501174211502075, "loss/hidden": 3.0703125, "loss/jsd": 0.0, "loss/logits": 0.2359970659017563, "step": 5700 }, { "epoch": 0.356375, "grad_norm": 2.703125, "grad_norm_var": 0.014957682291666666, "learning_rate": 0.0001, "loss": 7.8261, "loss/crossentropy": 2.1803048849105835, "loss/hidden": 2.953125, "loss/jsd": 0.0, "loss/logits": 0.23239052295684814, "step": 5702 }, { "epoch": 0.3565, "grad_norm": 2.78125, "grad_norm_var": 0.015038045247395833, "learning_rate": 0.0001, "loss": 7.5962, "loss/crossentropy": 2.283075451850891, "loss/hidden": 2.9921875, "loss/jsd": 0.0, "loss/logits": 0.23558545857667923, "step": 5704 }, { "epoch": 0.356625, "grad_norm": 3.03125, "grad_norm_var": 0.021686808268229166, "learning_rate": 0.0001, "loss": 7.8766, "loss/crossentropy": 2.4021514654159546, "loss/hidden": 3.078125, "loss/jsd": 0.0, "loss/logits": 0.23355989158153534, "step": 5706 }, { "epoch": 0.35675, "grad_norm": 2.484375, "grad_norm_var": 0.02427978515625, "learning_rate": 0.0001, "loss": 7.3758, "loss/crossentropy": 2.2131272554397583, "loss/hidden": 2.9453125, "loss/jsd": 0.0, "loss/logits": 0.21188893914222717, "step": 5708 }, { "epoch": 0.356875, "grad_norm": 2.8125, "grad_norm_var": 0.020002237955729165, "learning_rate": 0.0001, "loss": 7.6363, "loss/crossentropy": 2.0030104517936707, "loss/hidden": 3.078125, "loss/jsd": 0.0, "loss/logits": 0.2232322171330452, "step": 5710 }, { "epoch": 0.357, "grad_norm": 2.703125, "grad_norm_var": 0.0199859619140625, "learning_rate": 0.0001, "loss": 7.671, "loss/crossentropy": 2.366120934486389, "loss/hidden": 3.1171875, "loss/jsd": 0.0, "loss/logits": 0.2394869551062584, "step": 5712 }, { "epoch": 0.357125, "grad_norm": 2.609375, "grad_norm_var": 0.01666259765625, "learning_rate": 0.0001, "loss": 7.8112, "loss/crossentropy": 2.460438847541809, "loss/hidden": 3.0546875, "loss/jsd": 0.0, "loss/logits": 0.24504510313272476, "step": 5714 }, { "epoch": 0.35725, "grad_norm": 2.8125, "grad_norm_var": 0.017415364583333332, "learning_rate": 0.0001, "loss": 7.8684, "loss/crossentropy": 2.2430474758148193, "loss/hidden": 3.078125, "loss/jsd": 0.0, "loss/logits": 0.23894642293453217, "step": 5716 }, { "epoch": 0.357375, "grad_norm": 2.78125, "grad_norm_var": 0.018001302083333334, "learning_rate": 0.0001, "loss": 7.728, "loss/crossentropy": 2.3139495849609375, "loss/hidden": 2.96875, "loss/jsd": 0.0, "loss/logits": 0.23589037358760834, "step": 5718 }, { "epoch": 0.3575, "grad_norm": 2.578125, "grad_norm_var": 0.018675740559895834, "learning_rate": 0.0001, "loss": 7.6179, "loss/crossentropy": 2.396698236465454, "loss/hidden": 3.015625, "loss/jsd": 0.0, "loss/logits": 0.2219223603606224, "step": 5720 }, { "epoch": 0.357625, "grad_norm": 2.8125, "grad_norm_var": 0.0119537353515625, "learning_rate": 0.0001, "loss": 7.7524, "loss/crossentropy": 2.159409284591675, "loss/hidden": 2.9765625, "loss/jsd": 0.0, "loss/logits": 0.22207806259393692, "step": 5722 }, { "epoch": 0.35775, "grad_norm": 2.6875, "grad_norm_var": 0.008665974934895833, "learning_rate": 0.0001, "loss": 7.6531, "loss/crossentropy": 2.114900290966034, "loss/hidden": 2.984375, "loss/jsd": 0.0, "loss/logits": 0.21309657394886017, "step": 5724 }, { "epoch": 0.357875, "grad_norm": 2.609375, "grad_norm_var": 0.009007771809895834, "learning_rate": 0.0001, "loss": 7.7299, "loss/crossentropy": 2.370342969894409, "loss/hidden": 3.15625, "loss/jsd": 0.0, "loss/logits": 0.23888298869132996, "step": 5726 }, { "epoch": 0.358, "grad_norm": 3.0, "grad_norm_var": 0.014623006184895834, "learning_rate": 0.0001, "loss": 7.6268, "loss/crossentropy": 2.2870720624923706, "loss/hidden": 3.09375, "loss/jsd": 0.0, "loss/logits": 0.2497321367263794, "step": 5728 }, { "epoch": 0.358125, "grad_norm": 2.84375, "grad_norm_var": 0.016011555989583332, "learning_rate": 0.0001, "loss": 7.7064, "loss/crossentropy": 2.363236665725708, "loss/hidden": 3.015625, "loss/jsd": 0.0, "loss/logits": 0.23332637548446655, "step": 5730 }, { "epoch": 0.35825, "grad_norm": 2.765625, "grad_norm_var": 0.017552693684895832, "learning_rate": 0.0001, "loss": 7.573, "loss/crossentropy": 2.1761854887008667, "loss/hidden": 3.03125, "loss/jsd": 0.0, "loss/logits": 0.22521265596151352, "step": 5732 }, { "epoch": 0.358375, "grad_norm": 2.671875, "grad_norm_var": 0.0168121337890625, "learning_rate": 0.0001, "loss": 7.5486, "loss/crossentropy": 2.4128466844558716, "loss/hidden": 2.9765625, "loss/jsd": 0.0, "loss/logits": 0.2225390523672104, "step": 5734 }, { "epoch": 0.3585, "grad_norm": 2.71875, "grad_norm_var": 0.017203776041666667, "learning_rate": 0.0001, "loss": 7.6541, "loss/crossentropy": 2.106870412826538, "loss/hidden": 3.0234375, "loss/jsd": 0.0, "loss/logits": 0.2253267988562584, "step": 5736 }, { "epoch": 0.358625, "grad_norm": 2.5, "grad_norm_var": 0.019343058268229168, "learning_rate": 0.0001, "loss": 7.7011, "loss/crossentropy": 2.401144862174988, "loss/hidden": 3.0, "loss/jsd": 0.0, "loss/logits": 0.24028535187244415, "step": 5738 }, { "epoch": 0.35875, "grad_norm": 2.875, "grad_norm_var": 0.020198567708333334, "learning_rate": 0.0001, "loss": 8.0241, "loss/crossentropy": 2.7318115234375, "loss/hidden": 3.203125, "loss/jsd": 0.0, "loss/logits": 0.26869483292102814, "step": 5740 }, { "epoch": 0.358875, "grad_norm": 2.96875, "grad_norm_var": 0.025511678059895834, "learning_rate": 0.0001, "loss": 7.8367, "loss/crossentropy": 2.3337209224700928, "loss/hidden": 3.0703125, "loss/jsd": 0.0, "loss/logits": 0.25457948446273804, "step": 5742 }, { "epoch": 0.359, "grad_norm": 2.515625, "grad_norm_var": 0.020482381184895832, "learning_rate": 0.0001, "loss": 7.4573, "loss/crossentropy": 2.3648521900177, "loss/hidden": 3.0, "loss/jsd": 0.0, "loss/logits": 0.2299303039908409, "step": 5744 }, { "epoch": 0.359125, "grad_norm": 2.484375, "grad_norm_var": 0.020442708333333334, "learning_rate": 0.0001, "loss": 7.3538, "loss/crossentropy": 2.2486501932144165, "loss/hidden": 2.875, "loss/jsd": 0.0, "loss/logits": 0.22633330523967743, "step": 5746 }, { "epoch": 0.35925, "grad_norm": 2.65625, "grad_norm_var": 0.019498697916666665, "learning_rate": 0.0001, "loss": 7.4857, "loss/crossentropy": 2.070233702659607, "loss/hidden": 2.9765625, "loss/jsd": 0.0, "loss/logits": 0.22894646972417831, "step": 5748 }, { "epoch": 0.359375, "grad_norm": 2.5, "grad_norm_var": 0.022086588541666667, "learning_rate": 0.0001, "loss": 7.5422, "loss/crossentropy": 2.1959941387176514, "loss/hidden": 2.9453125, "loss/jsd": 0.0, "loss/logits": 0.22951291501522064, "step": 5750 }, { "epoch": 0.3595, "grad_norm": 2.671875, "grad_norm_var": 0.020856730143229165, "learning_rate": 0.0001, "loss": 7.76, "loss/crossentropy": 2.3358755111694336, "loss/hidden": 3.0625, "loss/jsd": 0.0, "loss/logits": 0.23019885271787643, "step": 5752 }, { "epoch": 0.359625, "grad_norm": 2.6875, "grad_norm_var": 0.06048177083333333, "learning_rate": 0.0001, "loss": 7.7343, "loss/crossentropy": 2.2709667682647705, "loss/hidden": 3.078125, "loss/jsd": 0.0, "loss/logits": 0.2551580220460892, "step": 5754 }, { "epoch": 0.35975, "grad_norm": 2.59375, "grad_norm_var": 0.059056599934895836, "learning_rate": 0.0001, "loss": 7.6166, "loss/crossentropy": 2.2347922325134277, "loss/hidden": 2.9453125, "loss/jsd": 0.0, "loss/logits": 0.23468506336212158, "step": 5756 }, { "epoch": 0.359875, "grad_norm": 2.84375, "grad_norm_var": 0.0555572509765625, "learning_rate": 0.0001, "loss": 7.7217, "loss/crossentropy": 2.171838402748108, "loss/hidden": 3.0859375, "loss/jsd": 0.0, "loss/logits": 0.23764898627996445, "step": 5758 }, { "epoch": 0.36, "grad_norm": 2.765625, "grad_norm_var": 0.05357157389322917, "learning_rate": 0.0001, "loss": 7.7349, "loss/crossentropy": 2.2026760578155518, "loss/hidden": 2.9765625, "loss/jsd": 0.0, "loss/logits": 0.2261125147342682, "step": 5760 }, { "epoch": 0.360125, "grad_norm": 2.46875, "grad_norm_var": 0.0515045166015625, "learning_rate": 0.0001, "loss": 7.7557, "loss/crossentropy": 2.361696481704712, "loss/hidden": 3.0625, "loss/jsd": 0.0, "loss/logits": 0.2469499111175537, "step": 5762 }, { "epoch": 0.36025, "grad_norm": 2.65625, "grad_norm_var": 0.04901936848958333, "learning_rate": 0.0001, "loss": 7.614, "loss/crossentropy": 2.1273152828216553, "loss/hidden": 3.0703125, "loss/jsd": 0.0, "loss/logits": 0.23618457466363907, "step": 5764 }, { "epoch": 0.360375, "grad_norm": 2.890625, "grad_norm_var": 0.04676106770833333, "learning_rate": 0.0001, "loss": 7.8021, "loss/crossentropy": 2.520339846611023, "loss/hidden": 2.9765625, "loss/jsd": 0.0, "loss/logits": 0.24465452134609222, "step": 5766 }, { "epoch": 0.3605, "grad_norm": 2.65625, "grad_norm_var": 0.046930948893229164, "learning_rate": 0.0001, "loss": 7.5666, "loss/crossentropy": 2.1336361169815063, "loss/hidden": 3.0078125, "loss/jsd": 0.0, "loss/logits": 0.2215401753783226, "step": 5768 }, { "epoch": 0.360625, "grad_norm": 2.75, "grad_norm_var": 0.011693318684895834, "learning_rate": 0.0001, "loss": 7.4997, "loss/crossentropy": 2.202090620994568, "loss/hidden": 3.0625, "loss/jsd": 0.0, "loss/logits": 0.2281825691461563, "step": 5770 }, { "epoch": 0.36075, "grad_norm": 2.71875, "grad_norm_var": 0.009992472330729167, "learning_rate": 0.0001, "loss": 7.7834, "loss/crossentropy": 2.368329644203186, "loss/hidden": 3.109375, "loss/jsd": 0.0, "loss/logits": 0.2590648978948593, "step": 5772 }, { "epoch": 0.360875, "grad_norm": 2.484375, "grad_norm_var": 0.0124176025390625, "learning_rate": 0.0001, "loss": 7.6081, "loss/crossentropy": 2.168250799179077, "loss/hidden": 3.046875, "loss/jsd": 0.0, "loss/logits": 0.2538819760084152, "step": 5774 }, { "epoch": 0.361, "grad_norm": 2.8125, "grad_norm_var": 0.017625935872395835, "learning_rate": 0.0001, "loss": 7.855, "loss/crossentropy": 2.185101628303528, "loss/hidden": 3.0546875, "loss/jsd": 0.0, "loss/logits": 0.22262858599424362, "step": 5776 }, { "epoch": 0.361125, "grad_norm": 2.59375, "grad_norm_var": 0.015653483072916665, "learning_rate": 0.0001, "loss": 7.5825, "loss/crossentropy": 2.2893370389938354, "loss/hidden": 3.0703125, "loss/jsd": 0.0, "loss/logits": 0.22537656873464584, "step": 5778 }, { "epoch": 0.36125, "grad_norm": 3.109375, "grad_norm_var": 0.0247955322265625, "learning_rate": 0.0001, "loss": 7.8587, "loss/crossentropy": 2.150850534439087, "loss/hidden": 3.140625, "loss/jsd": 0.0, "loss/logits": 0.2558140307664871, "step": 5780 }, { "epoch": 0.361375, "grad_norm": 2.578125, "grad_norm_var": 0.03717447916666667, "learning_rate": 0.0001, "loss": 7.8579, "loss/crossentropy": 2.2104331254959106, "loss/hidden": 2.9375, "loss/jsd": 0.0, "loss/logits": 0.22599218785762787, "step": 5782 }, { "epoch": 0.3615, "grad_norm": 2.84375, "grad_norm_var": 0.037629191080729166, "learning_rate": 0.0001, "loss": 7.7026, "loss/crossentropy": 2.2741633653640747, "loss/hidden": 3.109375, "loss/jsd": 0.0, "loss/logits": 0.2677592784166336, "step": 5784 }, { "epoch": 0.361625, "grad_norm": 2.609375, "grad_norm_var": 0.0390777587890625, "learning_rate": 0.0001, "loss": 7.8264, "loss/crossentropy": 2.2916316986083984, "loss/hidden": 3.1328125, "loss/jsd": 0.0, "loss/logits": 0.2495860978960991, "step": 5786 }, { "epoch": 0.36175, "grad_norm": 2.6875, "grad_norm_var": 0.03955078125, "learning_rate": 0.0001, "loss": 7.5611, "loss/crossentropy": 2.0463147163391113, "loss/hidden": 2.96875, "loss/jsd": 0.0, "loss/logits": 0.23029808700084686, "step": 5788 }, { "epoch": 0.361875, "grad_norm": 2.578125, "grad_norm_var": 0.03502197265625, "learning_rate": 0.0001, "loss": 7.7066, "loss/crossentropy": 2.263827085494995, "loss/hidden": 3.109375, "loss/jsd": 0.0, "loss/logits": 0.23307229578495026, "step": 5790 }, { "epoch": 0.362, "grad_norm": 2.6875, "grad_norm_var": 0.031962076822916664, "learning_rate": 0.0001, "loss": 7.5446, "loss/crossentropy": 2.3556219339370728, "loss/hidden": 2.984375, "loss/jsd": 0.0, "loss/logits": 0.23637986928224564, "step": 5792 }, { "epoch": 0.362125, "grad_norm": 2.671875, "grad_norm_var": 0.029230753580729168, "learning_rate": 0.0001, "loss": 7.6615, "loss/crossentropy": 2.306758999824524, "loss/hidden": 2.9921875, "loss/jsd": 0.0, "loss/logits": 0.24254638701677322, "step": 5794 }, { "epoch": 0.36225, "grad_norm": 2.515625, "grad_norm_var": 0.0220855712890625, "learning_rate": 0.0001, "loss": 7.6208, "loss/crossentropy": 2.177502393722534, "loss/hidden": 3.15625, "loss/jsd": 0.0, "loss/logits": 0.2685846984386444, "step": 5796 }, { "epoch": 0.362375, "grad_norm": 2.921875, "grad_norm_var": 0.009566243489583333, "learning_rate": 0.0001, "loss": 7.804, "loss/crossentropy": 2.295432686805725, "loss/hidden": 3.078125, "loss/jsd": 0.0, "loss/logits": 0.23979559540748596, "step": 5798 }, { "epoch": 0.3625, "grad_norm": 2.671875, "grad_norm_var": 0.008133951822916667, "learning_rate": 0.0001, "loss": 7.7852, "loss/crossentropy": 2.237891912460327, "loss/hidden": 3.0703125, "loss/jsd": 0.0, "loss/logits": 0.24718661606311798, "step": 5800 }, { "epoch": 0.362625, "grad_norm": 2.65625, "grad_norm_var": 0.007233683268229167, "learning_rate": 0.0001, "loss": 7.8308, "loss/crossentropy": 2.3835798501968384, "loss/hidden": 3.0234375, "loss/jsd": 0.0, "loss/logits": 0.23231332749128342, "step": 5802 }, { "epoch": 0.36275, "grad_norm": 2.5625, "grad_norm_var": 0.008259073893229166, "learning_rate": 0.0001, "loss": 7.5373, "loss/crossentropy": 2.254540503025055, "loss/hidden": 2.9296875, "loss/jsd": 0.0, "loss/logits": 0.229118213057518, "step": 5804 }, { "epoch": 0.362875, "grad_norm": 2.71875, "grad_norm_var": 0.008707682291666666, "learning_rate": 0.0001, "loss": 7.6709, "loss/crossentropy": 2.068022668361664, "loss/hidden": 3.0078125, "loss/jsd": 0.0, "loss/logits": 0.22857226431369781, "step": 5806 }, { "epoch": 0.363, "grad_norm": 2.5625, "grad_norm_var": 0.011774698893229166, "learning_rate": 0.0001, "loss": 7.575, "loss/crossentropy": 2.3564945459365845, "loss/hidden": 3.03125, "loss/jsd": 0.0, "loss/logits": 0.23851487040519714, "step": 5808 }, { "epoch": 0.363125, "grad_norm": 2.84375, "grad_norm_var": 0.013411458333333333, "learning_rate": 0.0001, "loss": 7.5621, "loss/crossentropy": 2.161305069923401, "loss/hidden": 3.0, "loss/jsd": 0.0, "loss/logits": 0.23228955268859863, "step": 5810 }, { "epoch": 0.36325, "grad_norm": 2.640625, "grad_norm_var": 0.014697265625, "learning_rate": 0.0001, "loss": 7.8582, "loss/crossentropy": 2.309541344642639, "loss/hidden": 3.0859375, "loss/jsd": 0.0, "loss/logits": 0.24874630570411682, "step": 5812 }, { "epoch": 0.363375, "grad_norm": 2.484375, "grad_norm_var": 0.015348307291666667, "learning_rate": 0.0001, "loss": 7.574, "loss/crossentropy": 2.319241166114807, "loss/hidden": 3.0, "loss/jsd": 0.0, "loss/logits": 0.21827013790607452, "step": 5814 }, { "epoch": 0.3635, "grad_norm": 2.453125, "grad_norm_var": 0.017578125, "learning_rate": 0.0001, "loss": 7.6049, "loss/crossentropy": 2.3824607133865356, "loss/hidden": 3.0390625, "loss/jsd": 0.0, "loss/logits": 0.23591673374176025, "step": 5816 }, { "epoch": 0.363625, "grad_norm": 2.796875, "grad_norm_var": 0.0189605712890625, "learning_rate": 0.0001, "loss": 7.6317, "loss/crossentropy": 2.2936993837356567, "loss/hidden": 2.9921875, "loss/jsd": 0.0, "loss/logits": 0.22995467483997345, "step": 5818 }, { "epoch": 0.36375, "grad_norm": 2.625, "grad_norm_var": 0.031004842122395834, "learning_rate": 0.0001, "loss": 7.7985, "loss/crossentropy": 2.266105532646179, "loss/hidden": 3.0234375, "loss/jsd": 0.0, "loss/logits": 0.2358851656317711, "step": 5820 }, { "epoch": 0.363875, "grad_norm": 2.859375, "grad_norm_var": 0.03280843098958333, "learning_rate": 0.0001, "loss": 7.8707, "loss/crossentropy": 2.484622359275818, "loss/hidden": 3.0703125, "loss/jsd": 0.0, "loss/logits": 0.2356501966714859, "step": 5822 }, { "epoch": 0.364, "grad_norm": 2.765625, "grad_norm_var": 0.031525675455729166, "learning_rate": 0.0001, "loss": 7.9127, "loss/crossentropy": 2.4101911783218384, "loss/hidden": 3.1328125, "loss/jsd": 0.0, "loss/logits": 0.2482302337884903, "step": 5824 }, { "epoch": 0.364125, "grad_norm": 2.65625, "grad_norm_var": 0.0307281494140625, "learning_rate": 0.0001, "loss": 7.6096, "loss/crossentropy": 2.1108115911483765, "loss/hidden": 3.046875, "loss/jsd": 0.0, "loss/logits": 0.2111392468214035, "step": 5826 }, { "epoch": 0.36425, "grad_norm": 2.671875, "grad_norm_var": 0.0280914306640625, "learning_rate": 0.0001, "loss": 7.7208, "loss/crossentropy": 2.3509035110473633, "loss/hidden": 2.984375, "loss/jsd": 0.0, "loss/logits": 0.24641770124435425, "step": 5828 }, { "epoch": 0.364375, "grad_norm": 2.515625, "grad_norm_var": 0.02603759765625, "learning_rate": 0.0001, "loss": 7.4791, "loss/crossentropy": 2.235402464866638, "loss/hidden": 3.03125, "loss/jsd": 0.0, "loss/logits": 0.2279442772269249, "step": 5830 }, { "epoch": 0.3645, "grad_norm": 2.671875, "grad_norm_var": 0.023356119791666668, "learning_rate": 0.0001, "loss": 7.4913, "loss/crossentropy": 2.2777727842330933, "loss/hidden": 3.0, "loss/jsd": 0.0, "loss/logits": 0.2562435418367386, "step": 5832 }, { "epoch": 0.364625, "grad_norm": 2.859375, "grad_norm_var": 0.0239898681640625, "learning_rate": 0.0001, "loss": 7.7762, "loss/crossentropy": 2.4849473237991333, "loss/hidden": 3.078125, "loss/jsd": 0.0, "loss/logits": 0.2280806005001068, "step": 5834 }, { "epoch": 0.36475, "grad_norm": 2.46875, "grad_norm_var": 0.019254557291666665, "learning_rate": 0.0001, "loss": 7.5189, "loss/crossentropy": 2.1271677017211914, "loss/hidden": 2.984375, "loss/jsd": 0.0, "loss/logits": 0.21770554780960083, "step": 5836 }, { "epoch": 0.364875, "grad_norm": 2.953125, "grad_norm_var": 0.021207682291666665, "learning_rate": 0.0001, "loss": 7.8169, "loss/crossentropy": 2.4321603775024414, "loss/hidden": 3.171875, "loss/jsd": 0.0, "loss/logits": 0.2932557016611099, "step": 5838 }, { "epoch": 0.365, "grad_norm": 2.6875, "grad_norm_var": 0.018382771809895834, "learning_rate": 0.0001, "loss": 7.6133, "loss/crossentropy": 2.2554616928100586, "loss/hidden": 3.03125, "loss/jsd": 0.0, "loss/logits": 0.24832912534475327, "step": 5840 }, { "epoch": 0.365125, "grad_norm": 2.703125, "grad_norm_var": 0.035643513997395834, "learning_rate": 0.0001, "loss": 7.5574, "loss/crossentropy": 2.1558557748794556, "loss/hidden": 3.09375, "loss/jsd": 0.0, "loss/logits": 0.23103642463684082, "step": 5842 }, { "epoch": 0.36525, "grad_norm": 2.46875, "grad_norm_var": 0.03974507649739583, "learning_rate": 0.0001, "loss": 7.4915, "loss/crossentropy": 2.0305283665657043, "loss/hidden": 3.0859375, "loss/jsd": 0.0, "loss/logits": 0.22221823036670685, "step": 5844 }, { "epoch": 0.365375, "grad_norm": 2.609375, "grad_norm_var": 0.03975321451822917, "learning_rate": 0.0001, "loss": 7.5188, "loss/crossentropy": 2.2241755723953247, "loss/hidden": 3.0625, "loss/jsd": 0.0, "loss/logits": 0.24019160121679306, "step": 5846 }, { "epoch": 0.3655, "grad_norm": 2.734375, "grad_norm_var": 0.03968098958333333, "learning_rate": 0.0001, "loss": 7.8933, "loss/crossentropy": 2.4630547761917114, "loss/hidden": 3.0703125, "loss/jsd": 0.0, "loss/logits": 0.25190863013267517, "step": 5848 }, { "epoch": 0.365625, "grad_norm": 2.59375, "grad_norm_var": 0.03762105305989583, "learning_rate": 0.0001, "loss": 7.5083, "loss/crossentropy": 2.397603750228882, "loss/hidden": 3.0703125, "loss/jsd": 0.0, "loss/logits": 0.2426484450697899, "step": 5850 }, { "epoch": 0.36575, "grad_norm": 2.765625, "grad_norm_var": 0.03797098795572917, "learning_rate": 0.0001, "loss": 7.6795, "loss/crossentropy": 2.4046876430511475, "loss/hidden": 3.03125, "loss/jsd": 0.0, "loss/logits": 0.2435351312160492, "step": 5852 }, { "epoch": 0.365875, "grad_norm": 2.875, "grad_norm_var": 0.03642476399739583, "learning_rate": 0.0001, "loss": 7.8708, "loss/crossentropy": 2.261651039123535, "loss/hidden": 3.0625, "loss/jsd": 0.0, "loss/logits": 0.24050821363925934, "step": 5854 }, { "epoch": 0.366, "grad_norm": 2.75, "grad_norm_var": 0.035009765625, "learning_rate": 0.0001, "loss": 7.8325, "loss/crossentropy": 2.3034123182296753, "loss/hidden": 2.984375, "loss/jsd": 0.0, "loss/logits": 0.234369657933712, "step": 5856 }, { "epoch": 0.366125, "grad_norm": 2.671875, "grad_norm_var": 0.023486328125, "learning_rate": 0.0001, "loss": 7.6876, "loss/crossentropy": 2.2053914070129395, "loss/hidden": 2.921875, "loss/jsd": 0.0, "loss/logits": 0.218138687312603, "step": 5858 }, { "epoch": 0.36625, "grad_norm": 2.75, "grad_norm_var": 0.02584228515625, "learning_rate": 0.0001, "loss": 7.9156, "loss/crossentropy": 2.4461461305618286, "loss/hidden": 3.0390625, "loss/jsd": 0.0, "loss/logits": 0.2389724925160408, "step": 5860 }, { "epoch": 0.366375, "grad_norm": 2.578125, "grad_norm_var": 0.022001139322916665, "learning_rate": 0.0001, "loss": 7.7191, "loss/crossentropy": 2.4519251585006714, "loss/hidden": 3.0390625, "loss/jsd": 0.0, "loss/logits": 0.2351590022444725, "step": 5862 }, { "epoch": 0.3665, "grad_norm": 2.53125, "grad_norm_var": 0.023258463541666666, "learning_rate": 0.0001, "loss": 7.5346, "loss/crossentropy": 2.161966562271118, "loss/hidden": 2.9765625, "loss/jsd": 0.0, "loss/logits": 0.20742753893136978, "step": 5864 }, { "epoch": 0.366625, "grad_norm": 2.734375, "grad_norm_var": 0.022477213541666666, "learning_rate": 0.0001, "loss": 7.556, "loss/crossentropy": 2.255254030227661, "loss/hidden": 2.96875, "loss/jsd": 0.0, "loss/logits": 0.22080879658460617, "step": 5866 }, { "epoch": 0.36675, "grad_norm": 2.5, "grad_norm_var": 0.021728515625, "learning_rate": 0.0001, "loss": 7.64, "loss/crossentropy": 2.2866841554641724, "loss/hidden": 3.0234375, "loss/jsd": 0.0, "loss/logits": 0.234619602560997, "step": 5868 }, { "epoch": 0.366875, "grad_norm": 2.59375, "grad_norm_var": 0.0194732666015625, "learning_rate": 0.0001, "loss": 7.6179, "loss/crossentropy": 2.2062015533447266, "loss/hidden": 2.984375, "loss/jsd": 0.0, "loss/logits": 0.21438821405172348, "step": 5870 }, { "epoch": 0.367, "grad_norm": 2.6875, "grad_norm_var": 0.018952433268229166, "learning_rate": 0.0001, "loss": 7.661, "loss/crossentropy": 2.4743188619613647, "loss/hidden": 3.078125, "loss/jsd": 0.0, "loss/logits": 0.23314779251813889, "step": 5872 }, { "epoch": 0.367125, "grad_norm": 2.546875, "grad_norm_var": 0.01533203125, "learning_rate": 0.0001, "loss": 7.6302, "loss/crossentropy": 2.201040029525757, "loss/hidden": 3.0, "loss/jsd": 0.0, "loss/logits": 0.22404490411281586, "step": 5874 }, { "epoch": 0.36725, "grad_norm": 2.46875, "grad_norm_var": 0.0072906494140625, "learning_rate": 0.0001, "loss": 7.2477, "loss/crossentropy": 2.14961576461792, "loss/hidden": 3.046875, "loss/jsd": 0.0, "loss/logits": 0.22370487451553345, "step": 5876 }, { "epoch": 0.367375, "grad_norm": 2.84375, "grad_norm_var": 0.01129150390625, "learning_rate": 0.0001, "loss": 7.6693, "loss/crossentropy": 2.0708529353141785, "loss/hidden": 2.90625, "loss/jsd": 0.0, "loss/logits": 0.21879161894321442, "step": 5878 }, { "epoch": 0.3675, "grad_norm": 2.671875, "grad_norm_var": 0.020197550455729168, "learning_rate": 0.0001, "loss": 7.8825, "loss/crossentropy": 2.5038328170776367, "loss/hidden": 3.078125, "loss/jsd": 0.0, "loss/logits": 0.2602665275335312, "step": 5880 }, { "epoch": 0.367625, "grad_norm": 2.5625, "grad_norm_var": 0.02047119140625, "learning_rate": 0.0001, "loss": 7.1971, "loss/crossentropy": 1.7745939493179321, "loss/hidden": 2.96875, "loss/jsd": 0.0, "loss/logits": 0.20980294048786163, "step": 5882 }, { "epoch": 0.36775, "grad_norm": 2.796875, "grad_norm_var": 0.023046875, "learning_rate": 0.0001, "loss": 7.7931, "loss/crossentropy": 2.2252343893051147, "loss/hidden": 3.0859375, "loss/jsd": 0.0, "loss/logits": 0.2432248890399933, "step": 5884 }, { "epoch": 0.367875, "grad_norm": 2.625, "grad_norm_var": 0.023021443684895834, "learning_rate": 0.0001, "loss": 7.8091, "loss/crossentropy": 2.213648796081543, "loss/hidden": 2.921875, "loss/jsd": 0.0, "loss/logits": 0.22556844353675842, "step": 5886 }, { "epoch": 0.368, "grad_norm": 2.71875, "grad_norm_var": 0.023942057291666666, "learning_rate": 0.0001, "loss": 7.7976, "loss/crossentropy": 2.4969310760498047, "loss/hidden": 3.1328125, "loss/jsd": 0.0, "loss/logits": 0.25347311794757843, "step": 5888 }, { "epoch": 0.368125, "grad_norm": 2.8125, "grad_norm_var": 0.023128255208333334, "learning_rate": 0.0001, "loss": 7.7526, "loss/crossentropy": 2.3898565769195557, "loss/hidden": 2.9609375, "loss/jsd": 0.0, "loss/logits": 0.2333284169435501, "step": 5890 }, { "epoch": 0.36825, "grad_norm": 2.6875, "grad_norm_var": 0.018876139322916666, "learning_rate": 0.0001, "loss": 7.737, "loss/crossentropy": 2.1193548440933228, "loss/hidden": 3.0234375, "loss/jsd": 0.0, "loss/logits": 0.23113426566123962, "step": 5892 }, { "epoch": 0.368375, "grad_norm": 3.34375, "grad_norm_var": 0.07916259765625, "learning_rate": 0.0001, "loss": 7.6829, "loss/crossentropy": 2.2866013050079346, "loss/hidden": 3.0234375, "loss/jsd": 0.0, "loss/logits": 0.22913432121276855, "step": 5894 }, { "epoch": 0.3685, "grad_norm": 2.75, "grad_norm_var": 0.07553609212239583, "learning_rate": 0.0001, "loss": 7.6661, "loss/crossentropy": 2.32522714138031, "loss/hidden": 3.03125, "loss/jsd": 0.0, "loss/logits": 0.23685137182474136, "step": 5896 }, { "epoch": 0.368625, "grad_norm": 2.546875, "grad_norm_var": 0.07281901041666666, "learning_rate": 0.0001, "loss": 7.6691, "loss/crossentropy": 2.48408305644989, "loss/hidden": 3.1015625, "loss/jsd": 0.0, "loss/logits": 0.24672800302505493, "step": 5898 }, { "epoch": 0.36875, "grad_norm": 2.578125, "grad_norm_var": 0.07582906087239584, "learning_rate": 0.0001, "loss": 7.5296, "loss/crossentropy": 2.3540754318237305, "loss/hidden": 3.0234375, "loss/jsd": 0.0, "loss/logits": 0.23204008489847183, "step": 5900 }, { "epoch": 0.368875, "grad_norm": 2.59375, "grad_norm_var": 0.076708984375, "learning_rate": 0.0001, "loss": 7.5615, "loss/crossentropy": 2.4670441150665283, "loss/hidden": 2.984375, "loss/jsd": 0.0, "loss/logits": 0.2386396825313568, "step": 5902 }, { "epoch": 0.369, "grad_norm": 2.53125, "grad_norm_var": 0.08369038899739584, "learning_rate": 0.0001, "loss": 7.2806, "loss/crossentropy": 1.9959867000579834, "loss/hidden": 2.9609375, "loss/jsd": 0.0, "loss/logits": 0.2062307819724083, "step": 5904 }, { "epoch": 0.369125, "grad_norm": 2.65625, "grad_norm_var": 0.08377176920572917, "learning_rate": 0.0001, "loss": 7.7509, "loss/crossentropy": 2.2012252807617188, "loss/hidden": 2.984375, "loss/jsd": 0.0, "loss/logits": 0.23279276490211487, "step": 5906 }, { "epoch": 0.36925, "grad_norm": 2.5625, "grad_norm_var": 0.08495992024739583, "learning_rate": 0.0001, "loss": 7.3945, "loss/crossentropy": 2.222532093524933, "loss/hidden": 2.9140625, "loss/jsd": 0.0, "loss/logits": 0.22606757283210754, "step": 5908 }, { "epoch": 0.369375, "grad_norm": 2.640625, "grad_norm_var": 0.020653279622395833, "learning_rate": 0.0001, "loss": 7.5244, "loss/crossentropy": 2.2612287998199463, "loss/hidden": 3.09375, "loss/jsd": 0.0, "loss/logits": 0.23983096331357956, "step": 5910 }, { "epoch": 0.3695, "grad_norm": 2.609375, "grad_norm_var": 0.017438761393229165, "learning_rate": 0.0001, "loss": 7.521, "loss/crossentropy": 2.4401954412460327, "loss/hidden": 3.0625, "loss/jsd": 0.0, "loss/logits": 0.22699681669473648, "step": 5912 }, { "epoch": 0.369625, "grad_norm": 2.671875, "grad_norm_var": 0.017048136393229166, "learning_rate": 0.0001, "loss": 7.5927, "loss/crossentropy": 2.104931354522705, "loss/hidden": 2.953125, "loss/jsd": 0.0, "loss/logits": 0.22078970074653625, "step": 5914 }, { "epoch": 0.36975, "grad_norm": 2.453125, "grad_norm_var": 0.019266764322916668, "learning_rate": 0.0001, "loss": 7.5568, "loss/crossentropy": 2.251745820045471, "loss/hidden": 2.9921875, "loss/jsd": 0.0, "loss/logits": 0.21398383378982544, "step": 5916 }, { "epoch": 0.369875, "grad_norm": 2.703125, "grad_norm_var": 0.0191802978515625, "learning_rate": 0.0001, "loss": 7.7468, "loss/crossentropy": 2.2169294357299805, "loss/hidden": 3.0, "loss/jsd": 0.0, "loss/logits": 0.24049219489097595, "step": 5918 }, { "epoch": 0.37, "grad_norm": 2.703125, "grad_norm_var": 0.015160115559895833, "learning_rate": 0.0001, "loss": 7.8715, "loss/crossentropy": 2.460370659828186, "loss/hidden": 3.0078125, "loss/jsd": 0.0, "loss/logits": 0.24377574771642685, "step": 5920 }, { "epoch": 0.370125, "grad_norm": 2.671875, "grad_norm_var": 0.0152740478515625, "learning_rate": 0.0001, "loss": 7.7745, "loss/crossentropy": 2.2527419328689575, "loss/hidden": 3.0234375, "loss/jsd": 0.0, "loss/logits": 0.24440738558769226, "step": 5922 }, { "epoch": 0.37025, "grad_norm": 2.734375, "grad_norm_var": 0.014156087239583334, "learning_rate": 0.0001, "loss": 7.7353, "loss/crossentropy": 2.0664572715759277, "loss/hidden": 3.0859375, "loss/jsd": 0.0, "loss/logits": 0.2222459763288498, "step": 5924 }, { "epoch": 0.370375, "grad_norm": 2.609375, "grad_norm_var": 0.004450480143229167, "learning_rate": 0.0001, "loss": 7.7481, "loss/crossentropy": 2.417236566543579, "loss/hidden": 3.046875, "loss/jsd": 0.0, "loss/logits": 0.2252212017774582, "step": 5926 }, { "epoch": 0.3705, "grad_norm": 2.546875, "grad_norm_var": 0.005492146809895833, "learning_rate": 0.0001, "loss": 7.7727, "loss/crossentropy": 2.1149237751960754, "loss/hidden": 2.953125, "loss/jsd": 0.0, "loss/logits": 0.22263246774673462, "step": 5928 }, { "epoch": 0.370625, "grad_norm": 2.765625, "grad_norm_var": 0.006624348958333333, "learning_rate": 0.0001, "loss": 7.6685, "loss/crossentropy": 2.2120312452316284, "loss/hidden": 3.0546875, "loss/jsd": 0.0, "loss/logits": 0.23706556111574173, "step": 5930 }, { "epoch": 0.37075, "grad_norm": 2.59375, "grad_norm_var": 0.0063435872395833336, "learning_rate": 0.0001, "loss": 7.5919, "loss/crossentropy": 2.078566312789917, "loss/hidden": 2.9921875, "loss/jsd": 0.0, "loss/logits": 0.22880972921848297, "step": 5932 }, { "epoch": 0.370875, "grad_norm": 2.71875, "grad_norm_var": 0.0082427978515625, "learning_rate": 0.0001, "loss": 7.4929, "loss/crossentropy": 2.3992310762405396, "loss/hidden": 3.03125, "loss/jsd": 0.0, "loss/logits": 0.24684636294841766, "step": 5934 }, { "epoch": 0.371, "grad_norm": 2.5625, "grad_norm_var": 0.008447265625, "learning_rate": 0.0001, "loss": 7.784, "loss/crossentropy": 2.466954231262207, "loss/hidden": 3.078125, "loss/jsd": 0.0, "loss/logits": 0.23078446835279465, "step": 5936 }, { "epoch": 0.371125, "grad_norm": 2.53125, "grad_norm_var": 0.00826416015625, "learning_rate": 0.0001, "loss": 7.7602, "loss/crossentropy": 2.3609282970428467, "loss/hidden": 2.9921875, "loss/jsd": 0.0, "loss/logits": 0.2247038632631302, "step": 5938 }, { "epoch": 0.37125, "grad_norm": 2.640625, "grad_norm_var": 0.0078765869140625, "learning_rate": 0.0001, "loss": 7.535, "loss/crossentropy": 2.263818144798279, "loss/hidden": 3.078125, "loss/jsd": 0.0, "loss/logits": 0.2534177154302597, "step": 5940 }, { "epoch": 0.371375, "grad_norm": 2.75, "grad_norm_var": 0.009423828125, "learning_rate": 0.0001, "loss": 7.8238, "loss/crossentropy": 2.3857834339141846, "loss/hidden": 3.046875, "loss/jsd": 0.0, "loss/logits": 0.24828483909368515, "step": 5942 }, { "epoch": 0.3715, "grad_norm": 2.9375, "grad_norm_var": 0.01539306640625, "learning_rate": 0.0001, "loss": 7.6151, "loss/crossentropy": 2.17099666595459, "loss/hidden": 3.125, "loss/jsd": 0.0, "loss/logits": 0.246975839138031, "step": 5944 }, { "epoch": 0.371625, "grad_norm": 2.78125, "grad_norm_var": 0.015559895833333334, "learning_rate": 0.0001, "loss": 7.8775, "loss/crossentropy": 2.387460470199585, "loss/hidden": 3.046875, "loss/jsd": 0.0, "loss/logits": 0.23917070776224136, "step": 5946 }, { "epoch": 0.37175, "grad_norm": 2.671875, "grad_norm_var": 0.013960774739583333, "learning_rate": 0.0001, "loss": 7.567, "loss/crossentropy": 2.363463878631592, "loss/hidden": 3.0625, "loss/jsd": 0.0, "loss/logits": 0.23225895315408707, "step": 5948 }, { "epoch": 0.371875, "grad_norm": 2.90625, "grad_norm_var": 0.015462239583333334, "learning_rate": 0.0001, "loss": 7.6478, "loss/crossentropy": 2.110305368900299, "loss/hidden": 3.0625, "loss/jsd": 0.0, "loss/logits": 0.24910727888345718, "step": 5950 }, { "epoch": 0.372, "grad_norm": 3.15625, "grad_norm_var": 0.028837076822916665, "learning_rate": 0.0001, "loss": 7.3519, "loss/crossentropy": 2.092695653438568, "loss/hidden": 3.0, "loss/jsd": 0.0, "loss/logits": 0.23405098170042038, "step": 5952 }, { "epoch": 0.372125, "grad_norm": 2.734375, "grad_norm_var": 0.025641886393229167, "learning_rate": 0.0001, "loss": 7.8816, "loss/crossentropy": 2.463305711746216, "loss/hidden": 3.078125, "loss/jsd": 0.0, "loss/logits": 0.2576056718826294, "step": 5954 }, { "epoch": 0.37225, "grad_norm": 2.90625, "grad_norm_var": 0.022001139322916665, "learning_rate": 0.0001, "loss": 7.7932, "loss/crossentropy": 2.3847213983535767, "loss/hidden": 3.0625, "loss/jsd": 0.0, "loss/logits": 0.2448548898100853, "step": 5956 }, { "epoch": 0.372375, "grad_norm": 2.8125, "grad_norm_var": 0.01842041015625, "learning_rate": 0.0001, "loss": 7.8203, "loss/crossentropy": 2.457032084465027, "loss/hidden": 3.0546875, "loss/jsd": 0.0, "loss/logits": 0.23640016466379166, "step": 5958 }, { "epoch": 0.3725, "grad_norm": 2.640625, "grad_norm_var": 0.0214263916015625, "learning_rate": 0.0001, "loss": 7.6963, "loss/crossentropy": 2.246446132659912, "loss/hidden": 2.953125, "loss/jsd": 0.0, "loss/logits": 0.22195183485746384, "step": 5960 }, { "epoch": 0.372625, "grad_norm": 2.515625, "grad_norm_var": 0.027652994791666666, "learning_rate": 0.0001, "loss": 7.5616, "loss/crossentropy": 2.255565047264099, "loss/hidden": 3.0078125, "loss/jsd": 0.0, "loss/logits": 0.23074719309806824, "step": 5962 }, { "epoch": 0.37275, "grad_norm": 2.8125, "grad_norm_var": 0.031004842122395834, "learning_rate": 0.0001, "loss": 7.651, "loss/crossentropy": 2.2640656232833862, "loss/hidden": 2.921875, "loss/jsd": 0.0, "loss/logits": 0.22042087465524673, "step": 5964 }, { "epoch": 0.372875, "grad_norm": 2.609375, "grad_norm_var": 0.029671223958333333, "learning_rate": 0.0001, "loss": 7.3665, "loss/crossentropy": 2.188614010810852, "loss/hidden": 2.9453125, "loss/jsd": 0.0, "loss/logits": 0.22926143556833267, "step": 5966 }, { "epoch": 0.373, "grad_norm": 2.734375, "grad_norm_var": 0.015999348958333333, "learning_rate": 0.0001, "loss": 7.8119, "loss/crossentropy": 2.408371925354004, "loss/hidden": 3.09375, "loss/jsd": 0.0, "loss/logits": 0.24314987659454346, "step": 5968 }, { "epoch": 0.373125, "grad_norm": 2.484375, "grad_norm_var": 0.018343098958333335, "learning_rate": 0.0001, "loss": 7.6113, "loss/crossentropy": 2.227108359336853, "loss/hidden": 2.9140625, "loss/jsd": 0.0, "loss/logits": 0.2167133092880249, "step": 5970 }, { "epoch": 0.37325, "grad_norm": 2.734375, "grad_norm_var": 0.013337198893229167, "learning_rate": 0.0001, "loss": 7.4463, "loss/crossentropy": 2.2483644485473633, "loss/hidden": 2.9921875, "loss/jsd": 0.0, "loss/logits": 0.2302241176366806, "step": 5972 }, { "epoch": 0.373375, "grad_norm": 2.578125, "grad_norm_var": 0.012202962239583334, "learning_rate": 0.0001, "loss": 7.7015, "loss/crossentropy": 2.3281365633010864, "loss/hidden": 3.0078125, "loss/jsd": 0.0, "loss/logits": 0.23920684307813644, "step": 5974 }, { "epoch": 0.3735, "grad_norm": 2.59375, "grad_norm_var": 0.011913045247395834, "learning_rate": 0.0001, "loss": 7.7932, "loss/crossentropy": 2.5587610006332397, "loss/hidden": 3.0078125, "loss/jsd": 0.0, "loss/logits": 0.23727624118328094, "step": 5976 }, { "epoch": 0.373625, "grad_norm": 2.609375, "grad_norm_var": 0.0097564697265625, "learning_rate": 0.0001, "loss": 7.7395, "loss/crossentropy": 2.4816921949386597, "loss/hidden": 2.9765625, "loss/jsd": 0.0, "loss/logits": 0.22629008442163467, "step": 5978 }, { "epoch": 0.37375, "grad_norm": 2.859375, "grad_norm_var": 0.2471832275390625, "learning_rate": 0.0001, "loss": 7.8407, "loss/crossentropy": 2.337537169456482, "loss/hidden": 3.0, "loss/jsd": 0.0, "loss/logits": 0.2310085967183113, "step": 5980 }, { "epoch": 0.373875, "grad_norm": 2.609375, "grad_norm_var": 0.24763081868489584, "learning_rate": 0.0001, "loss": 7.5726, "loss/crossentropy": 2.1069902181625366, "loss/hidden": 3.015625, "loss/jsd": 0.0, "loss/logits": 0.22254319489002228, "step": 5982 }, { "epoch": 0.374, "grad_norm": 2.71875, "grad_norm_var": 0.25110270182291666, "learning_rate": 0.0001, "loss": 7.7051, "loss/crossentropy": 2.3107383251190186, "loss/hidden": 3.046875, "loss/jsd": 0.0, "loss/logits": 0.24894680082798004, "step": 5984 }, { "epoch": 0.374125, "grad_norm": 2.46875, "grad_norm_var": 0.24958394368489584, "learning_rate": 0.0001, "loss": 7.4846, "loss/crossentropy": 2.199355721473694, "loss/hidden": 2.9765625, "loss/jsd": 0.0, "loss/logits": 0.21842436492443085, "step": 5986 }, { "epoch": 0.37425, "grad_norm": 2.546875, "grad_norm_var": 0.2533111572265625, "learning_rate": 0.0001, "loss": 7.7508, "loss/crossentropy": 2.137286603450775, "loss/hidden": 2.984375, "loss/jsd": 0.0, "loss/logits": 0.21731649339199066, "step": 5988 }, { "epoch": 0.374375, "grad_norm": 3.03125, "grad_norm_var": 0.2579986572265625, "learning_rate": 0.0001, "loss": 7.7678, "loss/crossentropy": 2.62995445728302, "loss/hidden": 3.1015625, "loss/jsd": 0.0, "loss/logits": 0.24149125814437866, "step": 5990 }, { "epoch": 0.3745, "grad_norm": 2.65625, "grad_norm_var": 0.25680338541666664, "learning_rate": 0.0001, "loss": 7.7294, "loss/crossentropy": 2.4583733081817627, "loss/hidden": 3.0703125, "loss/jsd": 0.0, "loss/logits": 0.21904631704092026, "step": 5992 }, { "epoch": 0.374625, "grad_norm": 2.5625, "grad_norm_var": 0.25738932291666666, "learning_rate": 0.0001, "loss": 7.8577, "loss/crossentropy": 2.449798107147217, "loss/hidden": 3.15625, "loss/jsd": 0.0, "loss/logits": 0.2651425004005432, "step": 5994 }, { "epoch": 0.37475, "grad_norm": 2.796875, "grad_norm_var": 0.019498697916666665, "learning_rate": 0.0001, "loss": 7.4875, "loss/crossentropy": 2.3657705783843994, "loss/hidden": 3.109375, "loss/jsd": 0.0, "loss/logits": 0.23899579793214798, "step": 5996 }, { "epoch": 0.374875, "grad_norm": 2.9375, "grad_norm_var": 0.024540201822916666, "learning_rate": 0.0001, "loss": 7.8128, "loss/crossentropy": 2.4570928812026978, "loss/hidden": 3.0, "loss/jsd": 0.0, "loss/logits": 0.24305948615074158, "step": 5998 }, { "epoch": 0.375, "grad_norm": 2.765625, "grad_norm_var": 0.024214680989583334, "learning_rate": 0.0001, "loss": 7.7806, "loss/crossentropy": 2.352820634841919, "loss/hidden": 3.1484375, "loss/jsd": 0.0, "loss/logits": 0.2659597098827362, "step": 6000 }, { "epoch": 0.375125, "grad_norm": 2.75, "grad_norm_var": 0.02154541015625, "learning_rate": 0.0001, "loss": 7.8043, "loss/crossentropy": 2.408151626586914, "loss/hidden": 3.15625, "loss/jsd": 0.0, "loss/logits": 0.2393057569861412, "step": 6002 }, { "epoch": 0.37525, "grad_norm": 2.453125, "grad_norm_var": 0.02392578125, "learning_rate": 0.0001, "loss": 7.6209, "loss/crossentropy": 2.2155799865722656, "loss/hidden": 2.96875, "loss/jsd": 0.0, "loss/logits": 0.23778663575649261, "step": 6004 }, { "epoch": 0.375375, "grad_norm": 2.71875, "grad_norm_var": 0.044188435872395834, "learning_rate": 0.0001, "loss": 7.6671, "loss/crossentropy": 2.308230757713318, "loss/hidden": 3.0, "loss/jsd": 0.0, "loss/logits": 0.2341061532497406, "step": 6006 }, { "epoch": 0.3755, "grad_norm": 2.6875, "grad_norm_var": 0.04554036458333333, "learning_rate": 0.0001, "loss": 7.5508, "loss/crossentropy": 2.2497153282165527, "loss/hidden": 2.9609375, "loss/jsd": 0.0, "loss/logits": 0.22864826768636703, "step": 6008 }, { "epoch": 0.375625, "grad_norm": 2.65625, "grad_norm_var": 0.047684733072916666, "learning_rate": 0.0001, "loss": 7.8659, "loss/crossentropy": 2.3210668563842773, "loss/hidden": 3.078125, "loss/jsd": 0.0, "loss/logits": 0.23793821036815643, "step": 6010 }, { "epoch": 0.37575, "grad_norm": 2.859375, "grad_norm_var": 0.042313639322916666, "learning_rate": 0.0001, "loss": 7.4668, "loss/crossentropy": 2.1286468505859375, "loss/hidden": 2.90625, "loss/jsd": 0.0, "loss/logits": 0.23601098358631134, "step": 6012 }, { "epoch": 0.375875, "grad_norm": 2.484375, "grad_norm_var": 0.053059895833333336, "learning_rate": 0.0001, "loss": 7.4417, "loss/crossentropy": 2.0199838280677795, "loss/hidden": 2.9765625, "loss/jsd": 0.0, "loss/logits": 0.23181351274251938, "step": 6014 }, { "epoch": 0.376, "grad_norm": 2.765625, "grad_norm_var": 0.053059895833333336, "learning_rate": 0.0001, "loss": 7.8015, "loss/crossentropy": 2.3172006607055664, "loss/hidden": 3.1171875, "loss/jsd": 0.0, "loss/logits": 0.24773040413856506, "step": 6016 }, { "epoch": 0.376125, "grad_norm": 2.84375, "grad_norm_var": 0.05366109212239583, "learning_rate": 0.0001, "loss": 7.8316, "loss/crossentropy": 2.342870354652405, "loss/hidden": 3.1328125, "loss/jsd": 0.0, "loss/logits": 0.23780003190040588, "step": 6018 }, { "epoch": 0.37625, "grad_norm": 2.796875, "grad_norm_var": 0.07469075520833333, "learning_rate": 0.0001, "loss": 7.7755, "loss/crossentropy": 2.311954140663147, "loss/hidden": 3.1328125, "loss/jsd": 0.0, "loss/logits": 0.2704939991235733, "step": 6020 }, { "epoch": 0.376375, "grad_norm": 2.515625, "grad_norm_var": 0.05579020182291667, "learning_rate": 0.0001, "loss": 7.5255, "loss/crossentropy": 1.9952068328857422, "loss/hidden": 2.9765625, "loss/jsd": 0.0, "loss/logits": 0.21815897524356842, "step": 6022 }, { "epoch": 0.3765, "grad_norm": 2.625, "grad_norm_var": 0.05507710774739583, "learning_rate": 0.0001, "loss": 7.6385, "loss/crossentropy": 2.256587505340576, "loss/hidden": 2.9453125, "loss/jsd": 0.0, "loss/logits": 0.2353539764881134, "step": 6024 }, { "epoch": 0.376625, "grad_norm": 2.65625, "grad_norm_var": 0.06562093098958334, "learning_rate": 0.0001, "loss": 7.9058, "loss/crossentropy": 2.3976510763168335, "loss/hidden": 3.1171875, "loss/jsd": 0.0, "loss/logits": 0.25791145861148834, "step": 6026 }, { "epoch": 0.37675, "grad_norm": 2.515625, "grad_norm_var": 0.07339579264322917, "learning_rate": 0.0001, "loss": 7.6396, "loss/crossentropy": 2.2685035467147827, "loss/hidden": 3.0703125, "loss/jsd": 0.0, "loss/logits": 0.22524115443229675, "step": 6028 }, { "epoch": 0.376875, "grad_norm": 3.0, "grad_norm_var": 0.06744384765625, "learning_rate": 0.0001, "loss": 7.6482, "loss/crossentropy": 2.3107270002365112, "loss/hidden": 3.0390625, "loss/jsd": 0.0, "loss/logits": 0.23320303112268448, "step": 6030 }, { "epoch": 0.377, "grad_norm": 3.015625, "grad_norm_var": 0.07727457682291666, "learning_rate": 0.0001, "loss": 7.2436, "loss/crossentropy": 2.0996251106262207, "loss/hidden": 2.9765625, "loss/jsd": 0.0, "loss/logits": 0.20807231962680817, "step": 6032 }, { "epoch": 0.377125, "grad_norm": 2.65625, "grad_norm_var": 0.07924702962239584, "learning_rate": 0.0001, "loss": 7.6094, "loss/crossentropy": 2.23851215839386, "loss/hidden": 3.0234375, "loss/jsd": 0.0, "loss/logits": 0.23858527839183807, "step": 6034 }, { "epoch": 0.37725, "grad_norm": 2.671875, "grad_norm_var": 0.052718098958333334, "learning_rate": 0.0001, "loss": 7.6318, "loss/crossentropy": 2.1331793665885925, "loss/hidden": 2.9921875, "loss/jsd": 0.0, "loss/logits": 0.23166512697935104, "step": 6036 }, { "epoch": 0.377375, "grad_norm": 2.78125, "grad_norm_var": 0.0510650634765625, "learning_rate": 0.0001, "loss": 7.735, "loss/crossentropy": 2.363166332244873, "loss/hidden": 3.09375, "loss/jsd": 0.0, "loss/logits": 0.23458106815814972, "step": 6038 }, { "epoch": 0.3775, "grad_norm": 2.671875, "grad_norm_var": 0.0511627197265625, "learning_rate": 0.0001, "loss": 7.7243, "loss/crossentropy": 2.310961365699768, "loss/hidden": 3.046875, "loss/jsd": 0.0, "loss/logits": 0.23389450460672379, "step": 6040 }, { "epoch": 0.377625, "grad_norm": 2.71875, "grad_norm_var": 0.0395660400390625, "learning_rate": 0.0001, "loss": 7.7804, "loss/crossentropy": 2.4806370735168457, "loss/hidden": 2.9921875, "loss/jsd": 0.0, "loss/logits": 0.24192671477794647, "step": 6042 }, { "epoch": 0.37775, "grad_norm": 2.765625, "grad_norm_var": 0.0334625244140625, "learning_rate": 0.0001, "loss": 7.31, "loss/crossentropy": 2.1615121960639954, "loss/hidden": 3.03125, "loss/jsd": 0.0, "loss/logits": 0.20999637246131897, "step": 6044 }, { "epoch": 0.377875, "grad_norm": 2.59375, "grad_norm_var": 0.028251139322916667, "learning_rate": 0.0001, "loss": 8.0382, "loss/crossentropy": 2.563165068626404, "loss/hidden": 3.1171875, "loss/jsd": 0.0, "loss/logits": 0.24339266121387482, "step": 6046 }, { "epoch": 0.378, "grad_norm": 2.671875, "grad_norm_var": 0.02017822265625, "learning_rate": 0.0001, "loss": 7.6975, "loss/crossentropy": 2.27353572845459, "loss/hidden": 2.984375, "loss/jsd": 0.0, "loss/logits": 0.21522746980190277, "step": 6048 }, { "epoch": 0.378125, "grad_norm": 2.578125, "grad_norm_var": 0.0158599853515625, "learning_rate": 0.0001, "loss": 7.5492, "loss/crossentropy": 2.212855100631714, "loss/hidden": 2.9921875, "loss/jsd": 0.0, "loss/logits": 0.24849861860275269, "step": 6050 }, { "epoch": 0.37825, "grad_norm": 2.5625, "grad_norm_var": 0.014452107747395833, "learning_rate": 0.0001, "loss": 7.6812, "loss/crossentropy": 2.3381168842315674, "loss/hidden": 3.0, "loss/jsd": 0.0, "loss/logits": 0.22568334639072418, "step": 6052 }, { "epoch": 0.378375, "grad_norm": 2.46875, "grad_norm_var": 0.014058430989583334, "learning_rate": 0.0001, "loss": 7.442, "loss/crossentropy": 2.228323459625244, "loss/hidden": 2.9765625, "loss/jsd": 0.0, "loss/logits": 0.24169079214334488, "step": 6054 }, { "epoch": 0.3785, "grad_norm": 2.71875, "grad_norm_var": 0.012630208333333334, "learning_rate": 0.0001, "loss": 7.6411, "loss/crossentropy": 2.322105050086975, "loss/hidden": 3.0546875, "loss/jsd": 0.0, "loss/logits": 0.23203061521053314, "step": 6056 }, { "epoch": 0.378625, "grad_norm": 2.640625, "grad_norm_var": 0.011551920572916667, "learning_rate": 0.0001, "loss": 7.7224, "loss/crossentropy": 2.159703016281128, "loss/hidden": 3.046875, "loss/jsd": 0.0, "loss/logits": 0.21591179072856903, "step": 6058 }, { "epoch": 0.37875, "grad_norm": 3.015625, "grad_norm_var": 0.024144490559895832, "learning_rate": 0.0001, "loss": 7.7152, "loss/crossentropy": 2.2216445207595825, "loss/hidden": 3.0546875, "loss/jsd": 0.0, "loss/logits": 0.24340912699699402, "step": 6060 }, { "epoch": 0.378875, "grad_norm": 2.421875, "grad_norm_var": 0.03134765625, "learning_rate": 0.0001, "loss": 7.4533, "loss/crossentropy": 2.086085796356201, "loss/hidden": 2.8671875, "loss/jsd": 0.0, "loss/logits": 0.2150716930627823, "step": 6062 }, { "epoch": 0.379, "grad_norm": 2.578125, "grad_norm_var": 0.0327301025390625, "learning_rate": 0.0001, "loss": 7.6244, "loss/crossentropy": 2.382838010787964, "loss/hidden": 3.0546875, "loss/jsd": 0.0, "loss/logits": 0.24344154447317123, "step": 6064 }, { "epoch": 0.379125, "grad_norm": 3.171875, "grad_norm_var": 0.0528961181640625, "learning_rate": 0.0001, "loss": 8.0578, "loss/crossentropy": 2.5265146493911743, "loss/hidden": 3.1328125, "loss/jsd": 0.0, "loss/logits": 0.27480366826057434, "step": 6066 }, { "epoch": 0.37925, "grad_norm": 2.46875, "grad_norm_var": 0.0550689697265625, "learning_rate": 0.0001, "loss": 7.5784, "loss/crossentropy": 2.1019030809402466, "loss/hidden": 3.0, "loss/jsd": 0.0, "loss/logits": 0.2160928100347519, "step": 6068 }, { "epoch": 0.379375, "grad_norm": 2.671875, "grad_norm_var": 0.05156148274739583, "learning_rate": 0.0001, "loss": 7.7018, "loss/crossentropy": 2.1890240907669067, "loss/hidden": 3.0859375, "loss/jsd": 0.0, "loss/logits": 0.23353660106658936, "step": 6070 }, { "epoch": 0.3795, "grad_norm": 2.59375, "grad_norm_var": 0.05442606608072917, "learning_rate": 0.0001, "loss": 7.754, "loss/crossentropy": 2.3444074392318726, "loss/hidden": 3.0390625, "loss/jsd": 0.0, "loss/logits": 0.23902088403701782, "step": 6072 }, { "epoch": 0.379625, "grad_norm": 2.640625, "grad_norm_var": 0.054361979166666664, "learning_rate": 0.0001, "loss": 7.5917, "loss/crossentropy": 2.328035831451416, "loss/hidden": 3.046875, "loss/jsd": 0.0, "loss/logits": 0.23949383199214935, "step": 6074 }, { "epoch": 0.37975, "grad_norm": 2.453125, "grad_norm_var": 0.04258524576822917, "learning_rate": 0.0001, "loss": 7.5871, "loss/crossentropy": 2.4384878873825073, "loss/hidden": 2.9453125, "loss/jsd": 0.0, "loss/logits": 0.21975906193256378, "step": 6076 }, { "epoch": 0.379875, "grad_norm": 2.546875, "grad_norm_var": 0.0364654541015625, "learning_rate": 0.0001, "loss": 7.7222, "loss/crossentropy": 2.444751262664795, "loss/hidden": 3.0859375, "loss/jsd": 0.0, "loss/logits": 0.24029017239809036, "step": 6078 }, { "epoch": 0.38, "grad_norm": 2.75, "grad_norm_var": 0.03294169108072917, "learning_rate": 0.0001, "loss": 7.4555, "loss/crossentropy": 2.2348215579986572, "loss/hidden": 2.953125, "loss/jsd": 0.0, "loss/logits": 0.23351821303367615, "step": 6080 }, { "epoch": 0.380125, "grad_norm": 2.6875, "grad_norm_var": 0.013081868489583334, "learning_rate": 0.0001, "loss": 7.474, "loss/crossentropy": 2.3435142040252686, "loss/hidden": 2.953125, "loss/jsd": 0.0, "loss/logits": 0.22855989634990692, "step": 6082 }, { "epoch": 0.38025, "grad_norm": 2.734375, "grad_norm_var": 0.015087890625, "learning_rate": 0.0001, "loss": 7.6719, "loss/crossentropy": 2.2072794437408447, "loss/hidden": 3.0234375, "loss/jsd": 0.0, "loss/logits": 0.24361062049865723, "step": 6084 }, { "epoch": 0.380375, "grad_norm": 2.640625, "grad_norm_var": 0.014579264322916667, "learning_rate": 0.0001, "loss": 7.6465, "loss/crossentropy": 2.4240407943725586, "loss/hidden": 3.046875, "loss/jsd": 0.0, "loss/logits": 0.22966670244932175, "step": 6086 }, { "epoch": 0.3805, "grad_norm": 2.578125, "grad_norm_var": 0.012984212239583333, "learning_rate": 0.0001, "loss": 7.6794, "loss/crossentropy": 2.371833920478821, "loss/hidden": 3.09375, "loss/jsd": 0.0, "loss/logits": 0.24130288511514664, "step": 6088 }, { "epoch": 0.380625, "grad_norm": 2.78125, "grad_norm_var": 0.014241536458333334, "learning_rate": 0.0001, "loss": 7.6707, "loss/crossentropy": 2.3284101486206055, "loss/hidden": 3.0078125, "loss/jsd": 0.0, "loss/logits": 0.24086137861013412, "step": 6090 }, { "epoch": 0.38075, "grad_norm": 2.546875, "grad_norm_var": 0.012565104166666667, "learning_rate": 0.0001, "loss": 7.6246, "loss/crossentropy": 2.2990914583206177, "loss/hidden": 3.0078125, "loss/jsd": 0.0, "loss/logits": 0.21879705786705017, "step": 6092 }, { "epoch": 0.380875, "grad_norm": 2.75, "grad_norm_var": 0.0132232666015625, "learning_rate": 0.0001, "loss": 7.598, "loss/crossentropy": 2.198532819747925, "loss/hidden": 2.921875, "loss/jsd": 0.0, "loss/logits": 0.2263047844171524, "step": 6094 }, { "epoch": 0.381, "grad_norm": 2.75, "grad_norm_var": 0.0154937744140625, "learning_rate": 0.0001, "loss": 7.6286, "loss/crossentropy": 2.2248634099960327, "loss/hidden": 2.9765625, "loss/jsd": 0.0, "loss/logits": 0.2126716747879982, "step": 6096 }, { "epoch": 0.381125, "grad_norm": 2.65625, "grad_norm_var": 0.014435831705729167, "learning_rate": 0.0001, "loss": 7.4346, "loss/crossentropy": 2.298315405845642, "loss/hidden": 2.921875, "loss/jsd": 0.0, "loss/logits": 0.2152308076620102, "step": 6098 }, { "epoch": 0.38125, "grad_norm": 2.546875, "grad_norm_var": 0.013395182291666667, "learning_rate": 0.0001, "loss": 7.626, "loss/crossentropy": 2.324112296104431, "loss/hidden": 2.9765625, "loss/jsd": 0.0, "loss/logits": 0.22443776577711105, "step": 6100 }, { "epoch": 0.381375, "grad_norm": 2.71875, "grad_norm_var": 0.016258748372395833, "learning_rate": 0.0001, "loss": 7.5771, "loss/crossentropy": 2.13493949174881, "loss/hidden": 3.03125, "loss/jsd": 0.0, "loss/logits": 0.23600095510482788, "step": 6102 }, { "epoch": 0.3815, "grad_norm": 2.765625, "grad_norm_var": 0.017731730143229166, "learning_rate": 0.0001, "loss": 7.6666, "loss/crossentropy": 2.2011935710906982, "loss/hidden": 3.015625, "loss/jsd": 0.0, "loss/logits": 0.22972387075424194, "step": 6104 }, { "epoch": 0.381625, "grad_norm": 2.53125, "grad_norm_var": 0.017308553059895832, "learning_rate": 0.0001, "loss": 7.5621, "loss/crossentropy": 2.26337468624115, "loss/hidden": 2.984375, "loss/jsd": 0.0, "loss/logits": 0.22819294035434723, "step": 6106 }, { "epoch": 0.38175, "grad_norm": 2.640625, "grad_norm_var": 0.01676025390625, "learning_rate": 0.0001, "loss": 7.8308, "loss/crossentropy": 2.4758858680725098, "loss/hidden": 2.984375, "loss/jsd": 0.0, "loss/logits": 0.23562601953744888, "step": 6108 }, { "epoch": 0.381875, "grad_norm": 2.703125, "grad_norm_var": 0.0152740478515625, "learning_rate": 0.0001, "loss": 7.6868, "loss/crossentropy": 2.291264772415161, "loss/hidden": 3.0703125, "loss/jsd": 0.0, "loss/logits": 0.24704581499099731, "step": 6110 }, { "epoch": 0.382, "grad_norm": 2.859375, "grad_norm_var": 0.0158355712890625, "learning_rate": 0.0001, "loss": 7.588, "loss/crossentropy": 2.072813391685486, "loss/hidden": 2.984375, "loss/jsd": 0.0, "loss/logits": 0.21798792481422424, "step": 6112 }, { "epoch": 0.382125, "grad_norm": 2.640625, "grad_norm_var": 0.0165191650390625, "learning_rate": 0.0001, "loss": 7.5059, "loss/crossentropy": 2.1852896213531494, "loss/hidden": 2.9453125, "loss/jsd": 0.0, "loss/logits": 0.22181613743305206, "step": 6114 }, { "epoch": 0.38225, "grad_norm": 2.640625, "grad_norm_var": 0.011328125, "learning_rate": 0.0001, "loss": 7.8351, "loss/crossentropy": 2.4135853052139282, "loss/hidden": 2.9453125, "loss/jsd": 0.0, "loss/logits": 0.2334243357181549, "step": 6116 }, { "epoch": 0.382375, "grad_norm": 2.609375, "grad_norm_var": 0.01025390625, "learning_rate": 0.0001, "loss": 7.7193, "loss/crossentropy": 2.178066372871399, "loss/hidden": 3.0234375, "loss/jsd": 0.0, "loss/logits": 0.24784520268440247, "step": 6118 }, { "epoch": 0.3825, "grad_norm": 2.65625, "grad_norm_var": 0.008610026041666666, "learning_rate": 0.0001, "loss": 7.5475, "loss/crossentropy": 2.428821086883545, "loss/hidden": 3.0234375, "loss/jsd": 0.0, "loss/logits": 0.2478656992316246, "step": 6120 }, { "epoch": 0.382625, "grad_norm": 2.703125, "grad_norm_var": 0.0080963134765625, "learning_rate": 0.0001, "loss": 7.4552, "loss/crossentropy": 2.2502583265304565, "loss/hidden": 2.9140625, "loss/jsd": 0.0, "loss/logits": 0.2202816605567932, "step": 6122 }, { "epoch": 0.38275, "grad_norm": 2.609375, "grad_norm_var": 0.0099273681640625, "learning_rate": 0.0001, "loss": 7.565, "loss/crossentropy": 2.2401230335235596, "loss/hidden": 2.96875, "loss/jsd": 0.0, "loss/logits": 0.22656769305467606, "step": 6124 }, { "epoch": 0.382875, "grad_norm": 2.828125, "grad_norm_var": 0.012528483072916667, "learning_rate": 0.0001, "loss": 7.7736, "loss/crossentropy": 2.2984206676483154, "loss/hidden": 3.1171875, "loss/jsd": 0.0, "loss/logits": 0.2202964723110199, "step": 6126 }, { "epoch": 0.383, "grad_norm": 2.640625, "grad_norm_var": 0.009455362955729166, "learning_rate": 0.0001, "loss": 7.5844, "loss/crossentropy": 2.5174983739852905, "loss/hidden": 3.1015625, "loss/jsd": 0.0, "loss/logits": 0.23828734457492828, "step": 6128 }, { "epoch": 0.383125, "grad_norm": 2.765625, "grad_norm_var": 0.14993082682291667, "learning_rate": 0.0001, "loss": 7.6639, "loss/crossentropy": 2.463420033454895, "loss/hidden": 3.0546875, "loss/jsd": 0.0, "loss/logits": 0.23308632522821426, "step": 6130 }, { "epoch": 0.38325, "grad_norm": 2.890625, "grad_norm_var": 0.14664306640625, "learning_rate": 0.0001, "loss": 7.5577, "loss/crossentropy": 2.23104989528656, "loss/hidden": 3.1328125, "loss/jsd": 0.0, "loss/logits": 0.2579338401556015, "step": 6132 }, { "epoch": 0.383375, "grad_norm": 3.296875, "grad_norm_var": 0.15752665201822916, "learning_rate": 0.0001, "loss": 7.5671, "loss/crossentropy": 2.1324918270111084, "loss/hidden": 3.0625, "loss/jsd": 0.0, "loss/logits": 0.24405094981193542, "step": 6134 }, { "epoch": 0.3835, "grad_norm": 2.484375, "grad_norm_var": 0.1662017822265625, "learning_rate": 0.0001, "loss": 7.5738, "loss/crossentropy": 2.189239740371704, "loss/hidden": 3.0390625, "loss/jsd": 0.0, "loss/logits": 0.2443932741880417, "step": 6136 }, { "epoch": 0.383625, "grad_norm": 2.71875, "grad_norm_var": 0.16852213541666666, "learning_rate": 0.0001, "loss": 7.4576, "loss/crossentropy": 2.124216377735138, "loss/hidden": 2.9765625, "loss/jsd": 0.0, "loss/logits": 0.24696872383356094, "step": 6138 }, { "epoch": 0.38375, "grad_norm": 2.59375, "grad_norm_var": 0.16896158854166668, "learning_rate": 0.0001, "loss": 7.5008, "loss/crossentropy": 2.2583760023117065, "loss/hidden": 2.984375, "loss/jsd": 0.0, "loss/logits": 0.21510569751262665, "step": 6140 }, { "epoch": 0.383875, "grad_norm": 2.515625, "grad_norm_var": 0.17273763020833333, "learning_rate": 0.0001, "loss": 7.587, "loss/crossentropy": 2.106873631477356, "loss/hidden": 3.046875, "loss/jsd": 0.0, "loss/logits": 0.22265609353780746, "step": 6142 }, { "epoch": 0.384, "grad_norm": 2.609375, "grad_norm_var": 0.171044921875, "learning_rate": 0.0001, "loss": 7.6011, "loss/crossentropy": 2.3995012044906616, "loss/hidden": 2.9765625, "loss/jsd": 0.0, "loss/logits": 0.23941193521022797, "step": 6144 }, { "epoch": 0.384125, "grad_norm": 2.703125, "grad_norm_var": 0.0429107666015625, "learning_rate": 0.0001, "loss": 7.7144, "loss/crossentropy": 2.393890619277954, "loss/hidden": 3.109375, "loss/jsd": 0.0, "loss/logits": 0.23992367833852768, "step": 6146 }, { "epoch": 0.38425, "grad_norm": 2.609375, "grad_norm_var": 0.04112040201822917, "learning_rate": 0.0001, "loss": 7.3439, "loss/crossentropy": 2.3042690753936768, "loss/hidden": 3.109375, "loss/jsd": 0.0, "loss/logits": 0.24251222610473633, "step": 6148 }, { "epoch": 0.384375, "grad_norm": 2.453125, "grad_norm_var": 0.012109375, "learning_rate": 0.0001, "loss": 7.5584, "loss/crossentropy": 2.3541401624679565, "loss/hidden": 2.953125, "loss/jsd": 0.0, "loss/logits": 0.22648414969444275, "step": 6150 }, { "epoch": 0.3845, "grad_norm": 2.515625, "grad_norm_var": 0.011637369791666666, "learning_rate": 0.0001, "loss": 7.435, "loss/crossentropy": 2.2438244819641113, "loss/hidden": 2.890625, "loss/jsd": 0.0, "loss/logits": 0.22370865941047668, "step": 6152 }, { "epoch": 0.384625, "grad_norm": 2.65625, "grad_norm_var": 0.0107421875, "learning_rate": 0.0001, "loss": 7.6937, "loss/crossentropy": 2.1709959506988525, "loss/hidden": 2.890625, "loss/jsd": 0.0, "loss/logits": 0.2162986770272255, "step": 6154 }, { "epoch": 0.38475, "grad_norm": 2.734375, "grad_norm_var": 0.013472493489583333, "learning_rate": 0.0001, "loss": 7.6166, "loss/crossentropy": 2.0791839361190796, "loss/hidden": 3.0625, "loss/jsd": 0.0, "loss/logits": 0.21749362349510193, "step": 6156 }, { "epoch": 0.384875, "grad_norm": 2.625, "grad_norm_var": 0.012800089518229167, "learning_rate": 0.0001, "loss": 7.6503, "loss/crossentropy": 2.491228938102722, "loss/hidden": 3.046875, "loss/jsd": 0.0, "loss/logits": 0.22725120931863785, "step": 6158 }, { "epoch": 0.385, "grad_norm": 4.125, "grad_norm_var": 0.14875895182291668, "learning_rate": 0.0001, "loss": 7.801, "loss/crossentropy": 2.2165181636810303, "loss/hidden": 3.28125, "loss/jsd": 0.0, "loss/logits": 0.26578599214553833, "step": 6160 }, { "epoch": 0.385125, "grad_norm": 2.75, "grad_norm_var": 0.14739176432291667, "learning_rate": 0.0001, "loss": 7.6733, "loss/crossentropy": 2.44809627532959, "loss/hidden": 3.0078125, "loss/jsd": 0.0, "loss/logits": 0.24047067761421204, "step": 6162 }, { "epoch": 0.38525, "grad_norm": 2.4375, "grad_norm_var": 0.15666402180989583, "learning_rate": 0.0001, "loss": 7.3456, "loss/crossentropy": 2.2379469871520996, "loss/hidden": 3.0546875, "loss/jsd": 0.0, "loss/logits": 0.24312739074230194, "step": 6164 }, { "epoch": 0.385375, "grad_norm": 2.703125, "grad_norm_var": 0.14886067708333334, "learning_rate": 0.0001, "loss": 7.742, "loss/crossentropy": 2.399543285369873, "loss/hidden": 3.0078125, "loss/jsd": 0.0, "loss/logits": 0.24021124839782715, "step": 6166 }, { "epoch": 0.3855, "grad_norm": 3.625, "grad_norm_var": 0.18679911295572918, "learning_rate": 0.0001, "loss": 7.7093, "loss/crossentropy": 2.2281036376953125, "loss/hidden": 2.9609375, "loss/jsd": 0.0, "loss/logits": 0.22720475494861603, "step": 6168 }, { "epoch": 0.385625, "grad_norm": 2.703125, "grad_norm_var": 0.18660380045572916, "learning_rate": 0.0001, "loss": 7.664, "loss/crossentropy": 2.2964099645614624, "loss/hidden": 3.0546875, "loss/jsd": 0.0, "loss/logits": 0.21180368959903717, "step": 6170 }, { "epoch": 0.38575, "grad_norm": 2.828125, "grad_norm_var": 0.18655192057291667, "learning_rate": 0.0001, "loss": 7.4629, "loss/crossentropy": 2.0407859086990356, "loss/hidden": 2.984375, "loss/jsd": 0.0, "loss/logits": 0.21141932904720306, "step": 6172 }, { "epoch": 0.385875, "grad_norm": 2.59375, "grad_norm_var": 0.18731180826822916, "learning_rate": 0.0001, "loss": 7.5633, "loss/crossentropy": 2.3801496028900146, "loss/hidden": 3.0859375, "loss/jsd": 0.0, "loss/logits": 0.2529350593686104, "step": 6174 }, { "epoch": 0.386, "grad_norm": 2.53125, "grad_norm_var": 0.07163798014322917, "learning_rate": 0.0001, "loss": 7.4573, "loss/crossentropy": 2.1455390453338623, "loss/hidden": 2.96875, "loss/jsd": 0.0, "loss/logits": 0.22570893168449402, "step": 6176 }, { "epoch": 0.386125, "grad_norm": 2.671875, "grad_norm_var": 0.06965738932291667, "learning_rate": 0.0001, "loss": 7.6168, "loss/crossentropy": 2.168286085128784, "loss/hidden": 3.0, "loss/jsd": 0.0, "loss/logits": 0.2408534362912178, "step": 6178 }, { "epoch": 0.38625, "grad_norm": 2.4375, "grad_norm_var": 0.06897786458333334, "learning_rate": 0.0001, "loss": 7.5626, "loss/crossentropy": 2.310054659843445, "loss/hidden": 3.09375, "loss/jsd": 0.0, "loss/logits": 0.2263261079788208, "step": 6180 }, { "epoch": 0.386375, "grad_norm": 2.484375, "grad_norm_var": 0.07023824055989583, "learning_rate": 0.0001, "loss": 7.7042, "loss/crossentropy": 2.683452010154724, "loss/hidden": 3.1015625, "loss/jsd": 0.0, "loss/logits": 0.25954148173332214, "step": 6182 }, { "epoch": 0.3865, "grad_norm": 2.59375, "grad_norm_var": 0.011181640625, "learning_rate": 0.0001, "loss": 7.7742, "loss/crossentropy": 2.572392463684082, "loss/hidden": 3.03125, "loss/jsd": 0.0, "loss/logits": 0.2342599406838417, "step": 6184 }, { "epoch": 0.386625, "grad_norm": 2.59375, "grad_norm_var": 0.011717732747395833, "learning_rate": 0.0001, "loss": 7.6304, "loss/crossentropy": 2.2197105884552, "loss/hidden": 3.015625, "loss/jsd": 0.0, "loss/logits": 0.22210820019245148, "step": 6186 }, { "epoch": 0.38675, "grad_norm": 2.578125, "grad_norm_var": 0.0074045817057291664, "learning_rate": 0.0001, "loss": 7.4809, "loss/crossentropy": 2.3251274824142456, "loss/hidden": 2.9921875, "loss/jsd": 0.0, "loss/logits": 0.22822305560112, "step": 6188 }, { "epoch": 0.386875, "grad_norm": 2.703125, "grad_norm_var": 0.0071451822916666664, "learning_rate": 0.0001, "loss": 7.5574, "loss/crossentropy": 2.360219120979309, "loss/hidden": 3.046875, "loss/jsd": 0.0, "loss/logits": 0.23756802082061768, "step": 6190 }, { "epoch": 0.387, "grad_norm": 2.671875, "grad_norm_var": 0.0069163004557291664, "learning_rate": 0.0001, "loss": 7.4855, "loss/crossentropy": 2.3355443477630615, "loss/hidden": 3.0625, "loss/jsd": 0.0, "loss/logits": 0.22813613712787628, "step": 6192 }, { "epoch": 0.387125, "grad_norm": 2.6875, "grad_norm_var": 0.01890869140625, "learning_rate": 0.0001, "loss": 7.5354, "loss/crossentropy": 2.217429995536804, "loss/hidden": 2.9296875, "loss/jsd": 0.0, "loss/logits": 0.22317424416542053, "step": 6194 }, { "epoch": 0.38725, "grad_norm": 2.5625, "grad_norm_var": 0.017041015625, "learning_rate": 0.0001, "loss": 7.7757, "loss/crossentropy": 2.2990550994873047, "loss/hidden": 3.0703125, "loss/jsd": 0.0, "loss/logits": 0.2334911972284317, "step": 6196 }, { "epoch": 0.387375, "grad_norm": 2.4375, "grad_norm_var": 0.0178131103515625, "learning_rate": 0.0001, "loss": 7.5928, "loss/crossentropy": 2.358139157295227, "loss/hidden": 3.0234375, "loss/jsd": 0.0, "loss/logits": 0.24143555015325546, "step": 6198 }, { "epoch": 0.3875, "grad_norm": 3.21875, "grad_norm_var": 0.04187825520833333, "learning_rate": 0.0001, "loss": 7.258, "loss/crossentropy": 2.3413894176483154, "loss/hidden": 3.0078125, "loss/jsd": 0.0, "loss/logits": 0.23546989262104034, "step": 6200 }, { "epoch": 0.387625, "grad_norm": 2.71875, "grad_norm_var": 0.045563761393229166, "learning_rate": 0.0001, "loss": 7.689, "loss/crossentropy": 2.4163408279418945, "loss/hidden": 2.9453125, "loss/jsd": 0.0, "loss/logits": 0.23429743945598602, "step": 6202 }, { "epoch": 0.38775, "grad_norm": 2.625, "grad_norm_var": 0.04566141764322917, "learning_rate": 0.0001, "loss": 7.5468, "loss/crossentropy": 2.3620532751083374, "loss/hidden": 3.1015625, "loss/jsd": 0.0, "loss/logits": 0.2505366653203964, "step": 6204 }, { "epoch": 0.387875, "grad_norm": 3.59375, "grad_norm_var": 0.27834879557291664, "learning_rate": 0.0001, "loss": 7.7757, "loss/crossentropy": 2.4979430437088013, "loss/hidden": 3.0546875, "loss/jsd": 0.0, "loss/logits": 0.24882471561431885, "step": 6206 }, { "epoch": 0.388, "grad_norm": 2.953125, "grad_norm_var": 0.26658528645833335, "learning_rate": 0.0001, "loss": 7.7603, "loss/crossentropy": 2.268473744392395, "loss/hidden": 3.0546875, "loss/jsd": 0.0, "loss/logits": 0.23721231520175934, "step": 6208 }, { "epoch": 0.388125, "grad_norm": 2.75, "grad_norm_var": 0.2640126546223958, "learning_rate": 0.0001, "loss": 7.8204, "loss/crossentropy": 2.505064606666565, "loss/hidden": 3.0859375, "loss/jsd": 0.0, "loss/logits": 0.2627449035644531, "step": 6210 }, { "epoch": 0.38825, "grad_norm": 2.859375, "grad_norm_var": 0.2584136962890625, "learning_rate": 0.0001, "loss": 7.6849, "loss/crossentropy": 2.3508905172348022, "loss/hidden": 3.0078125, "loss/jsd": 0.0, "loss/logits": 0.23537440598011017, "step": 6212 }, { "epoch": 0.388375, "grad_norm": 2.71875, "grad_norm_var": 0.2378326416015625, "learning_rate": 0.0001, "loss": 7.9688, "loss/crossentropy": 2.4239206314086914, "loss/hidden": 3.1484375, "loss/jsd": 0.0, "loss/logits": 0.2536824867129326, "step": 6214 }, { "epoch": 0.3885, "grad_norm": 2.578125, "grad_norm_var": 0.22952372233072918, "learning_rate": 0.0001, "loss": 7.6052, "loss/crossentropy": 2.1807726621627808, "loss/hidden": 2.9140625, "loss/jsd": 0.0, "loss/logits": 0.23037172853946686, "step": 6216 }, { "epoch": 0.388625, "grad_norm": 2.546875, "grad_norm_var": 0.23964436848958334, "learning_rate": 0.0001, "loss": 7.5335, "loss/crossentropy": 2.069059729576111, "loss/hidden": 2.9765625, "loss/jsd": 0.0, "loss/logits": 0.21161602437496185, "step": 6218 }, { "epoch": 0.38875, "grad_norm": 2.84375, "grad_norm_var": 0.22766011555989582, "learning_rate": 0.0001, "loss": 7.5032, "loss/crossentropy": 2.0730834007263184, "loss/hidden": 3.0078125, "loss/jsd": 0.0, "loss/logits": 0.23045386373996735, "step": 6220 }, { "epoch": 0.388875, "grad_norm": 2.5625, "grad_norm_var": 0.020048014322916665, "learning_rate": 0.0001, "loss": 7.6198, "loss/crossentropy": 2.277284026145935, "loss/hidden": 2.96875, "loss/jsd": 0.0, "loss/logits": 0.229339100420475, "step": 6222 }, { "epoch": 0.389, "grad_norm": 2.859375, "grad_norm_var": 0.0202789306640625, "learning_rate": 0.0001, "loss": 7.5042, "loss/crossentropy": 2.4536852836608887, "loss/hidden": 2.9921875, "loss/jsd": 0.0, "loss/logits": 0.22547976672649384, "step": 6224 }, { "epoch": 0.389125, "grad_norm": 2.390625, "grad_norm_var": 0.024283854166666667, "learning_rate": 0.0001, "loss": 7.5769, "loss/crossentropy": 2.1632683277130127, "loss/hidden": 2.953125, "loss/jsd": 0.0, "loss/logits": 0.21542678773403168, "step": 6226 }, { "epoch": 0.38925, "grad_norm": 2.484375, "grad_norm_var": 0.02506103515625, "learning_rate": 0.0001, "loss": 7.6292, "loss/crossentropy": 2.2627917528152466, "loss/hidden": 2.984375, "loss/jsd": 0.0, "loss/logits": 0.23226077109575272, "step": 6228 }, { "epoch": 0.389375, "grad_norm": 2.78125, "grad_norm_var": 0.022526041666666666, "learning_rate": 0.0001, "loss": 7.6491, "loss/crossentropy": 2.3328146934509277, "loss/hidden": 2.96875, "loss/jsd": 0.0, "loss/logits": 0.22146063297986984, "step": 6230 }, { "epoch": 0.3895, "grad_norm": 2.59375, "grad_norm_var": 0.0231109619140625, "learning_rate": 0.0001, "loss": 7.6946, "loss/crossentropy": 2.2065939903259277, "loss/hidden": 2.90625, "loss/jsd": 0.0, "loss/logits": 0.2365259975194931, "step": 6232 }, { "epoch": 0.389625, "grad_norm": 2.75, "grad_norm_var": 0.02880859375, "learning_rate": 0.0001, "loss": 7.5464, "loss/crossentropy": 2.105452299118042, "loss/hidden": 3.015625, "loss/jsd": 0.0, "loss/logits": 0.22692933678627014, "step": 6234 }, { "epoch": 0.38975, "grad_norm": 2.8125, "grad_norm_var": 0.02720947265625, "learning_rate": 0.0001, "loss": 7.6207, "loss/crossentropy": 2.297482490539551, "loss/hidden": 2.953125, "loss/jsd": 0.0, "loss/logits": 0.22834856808185577, "step": 6236 }, { "epoch": 0.389875, "grad_norm": 2.65625, "grad_norm_var": 0.023714192708333335, "learning_rate": 0.0001, "loss": 7.5103, "loss/crossentropy": 2.407977342605591, "loss/hidden": 3.0625, "loss/jsd": 0.0, "loss/logits": 0.23379070311784744, "step": 6238 }, { "epoch": 0.39, "grad_norm": 2.484375, "grad_norm_var": 0.020213826497395834, "learning_rate": 0.0001, "loss": 7.5604, "loss/crossentropy": 2.083792746067047, "loss/hidden": 3.0078125, "loss/jsd": 0.0, "loss/logits": 0.2547670975327492, "step": 6240 }, { "epoch": 0.390125, "grad_norm": 2.65625, "grad_norm_var": 0.015086873372395834, "learning_rate": 0.0001, "loss": 7.6132, "loss/crossentropy": 2.289841413497925, "loss/hidden": 3.078125, "loss/jsd": 0.0, "loss/logits": 0.21859893202781677, "step": 6242 }, { "epoch": 0.39025, "grad_norm": 2.53125, "grad_norm_var": 0.017867024739583334, "learning_rate": 0.0001, "loss": 7.6348, "loss/crossentropy": 2.5246392488479614, "loss/hidden": 3.0390625, "loss/jsd": 0.0, "loss/logits": 0.22999003529548645, "step": 6244 }, { "epoch": 0.390375, "grad_norm": 2.484375, "grad_norm_var": 0.019905598958333333, "learning_rate": 0.0001, "loss": 7.5779, "loss/crossentropy": 2.1156492829322815, "loss/hidden": 2.8984375, "loss/jsd": 0.0, "loss/logits": 0.21879246830940247, "step": 6246 }, { "epoch": 0.3905, "grad_norm": 2.578125, "grad_norm_var": 0.020377604166666667, "learning_rate": 0.0001, "loss": 7.6302, "loss/crossentropy": 2.283422589302063, "loss/hidden": 3.0078125, "loss/jsd": 0.0, "loss/logits": 0.22403715550899506, "step": 6248 }, { "epoch": 0.390625, "grad_norm": 2.640625, "grad_norm_var": 0.017341105143229167, "learning_rate": 0.0001, "loss": 7.5602, "loss/crossentropy": 2.176552951335907, "loss/hidden": 3.0390625, "loss/jsd": 0.0, "loss/logits": 0.23395199328660965, "step": 6250 }, { "epoch": 0.39075, "grad_norm": 2.46875, "grad_norm_var": 0.015755208333333333, "learning_rate": 0.0001, "loss": 7.7118, "loss/crossentropy": 2.2017061710357666, "loss/hidden": 2.921875, "loss/jsd": 0.0, "loss/logits": 0.22300279885530472, "step": 6252 }, { "epoch": 0.390875, "grad_norm": 2.640625, "grad_norm_var": 0.013525390625, "learning_rate": 0.0001, "loss": 7.3785, "loss/crossentropy": 2.1934911012649536, "loss/hidden": 2.9609375, "loss/jsd": 0.0, "loss/logits": 0.23343044519424438, "step": 6254 }, { "epoch": 0.391, "grad_norm": 2.421875, "grad_norm_var": 0.025414021809895833, "learning_rate": 0.0001, "loss": 7.6714, "loss/crossentropy": 2.2382506132125854, "loss/hidden": 2.96875, "loss/jsd": 0.0, "loss/logits": 0.24063657224178314, "step": 6256 }, { "epoch": 0.391125, "grad_norm": 2.578125, "grad_norm_var": 0.025651041666666666, "learning_rate": 0.0001, "loss": 7.521, "loss/crossentropy": 2.0697613954544067, "loss/hidden": 3.0234375, "loss/jsd": 0.0, "loss/logits": 0.23750290274620056, "step": 6258 }, { "epoch": 0.39125, "grad_norm": 2.578125, "grad_norm_var": 0.020048014322916665, "learning_rate": 0.0001, "loss": 7.4932, "loss/crossentropy": 2.2755059003829956, "loss/hidden": 3.0390625, "loss/jsd": 0.0, "loss/logits": 0.22635948657989502, "step": 6260 }, { "epoch": 0.391375, "grad_norm": 2.53125, "grad_norm_var": 0.019449869791666668, "learning_rate": 0.0001, "loss": 7.6384, "loss/crossentropy": 2.189927577972412, "loss/hidden": 3.015625, "loss/jsd": 0.0, "loss/logits": 0.21912392228841782, "step": 6262 }, { "epoch": 0.3915, "grad_norm": 2.84375, "grad_norm_var": 0.023356119791666668, "learning_rate": 0.0001, "loss": 7.627, "loss/crossentropy": 2.3261609077453613, "loss/hidden": 3.0078125, "loss/jsd": 0.0, "loss/logits": 0.2551092281937599, "step": 6264 }, { "epoch": 0.391625, "grad_norm": 2.734375, "grad_norm_var": 0.022391764322916667, "learning_rate": 0.0001, "loss": 7.8164, "loss/crossentropy": 2.3739001750946045, "loss/hidden": 3.1171875, "loss/jsd": 0.0, "loss/logits": 0.26913388073444366, "step": 6266 }, { "epoch": 0.39175, "grad_norm": 3.015625, "grad_norm_var": 0.03590087890625, "learning_rate": 0.0001, "loss": 7.6214, "loss/crossentropy": 2.248622179031372, "loss/hidden": 3.0, "loss/jsd": 0.0, "loss/logits": 0.2420874983072281, "step": 6268 }, { "epoch": 0.391875, "grad_norm": 2.46875, "grad_norm_var": 0.03968098958333333, "learning_rate": 0.0001, "loss": 7.489, "loss/crossentropy": 2.2899194955825806, "loss/hidden": 3.0234375, "loss/jsd": 0.0, "loss/logits": 0.22222957015037537, "step": 6270 }, { "epoch": 0.392, "grad_norm": 2.625, "grad_norm_var": 0.028278605143229166, "learning_rate": 0.0001, "loss": 7.5178, "loss/crossentropy": 2.376173257827759, "loss/hidden": 2.984375, "loss/jsd": 0.0, "loss/logits": 0.2134167104959488, "step": 6272 }, { "epoch": 0.392125, "grad_norm": 2.53125, "grad_norm_var": 0.027586873372395834, "learning_rate": 0.0001, "loss": 7.548, "loss/crossentropy": 2.318391442298889, "loss/hidden": 3.0703125, "loss/jsd": 0.0, "loss/logits": 0.21232503652572632, "step": 6274 }, { "epoch": 0.39225, "grad_norm": 2.359375, "grad_norm_var": 0.029899088541666667, "learning_rate": 0.0001, "loss": 7.5438, "loss/crossentropy": 2.2681090235710144, "loss/hidden": 3.0, "loss/jsd": 0.0, "loss/logits": 0.22525951266288757, "step": 6276 }, { "epoch": 0.392375, "grad_norm": 2.609375, "grad_norm_var": 0.028547159830729165, "learning_rate": 0.0001, "loss": 7.4117, "loss/crossentropy": 2.25672447681427, "loss/hidden": 3.0859375, "loss/jsd": 0.0, "loss/logits": 0.22654114663600922, "step": 6278 }, { "epoch": 0.3925, "grad_norm": 2.40625, "grad_norm_var": 0.028934733072916666, "learning_rate": 0.0001, "loss": 7.5579, "loss/crossentropy": 2.1942915320396423, "loss/hidden": 2.9296875, "loss/jsd": 0.0, "loss/logits": 0.2309338003396988, "step": 6280 }, { "epoch": 0.392625, "grad_norm": 2.828125, "grad_norm_var": 0.03398335774739583, "learning_rate": 0.0001, "loss": 7.5399, "loss/crossentropy": 2.1502444744110107, "loss/hidden": 3.0078125, "loss/jsd": 0.0, "loss/logits": 0.2250187247991562, "step": 6282 }, { "epoch": 0.39275, "grad_norm": 2.640625, "grad_norm_var": 0.055517578125, "learning_rate": 0.0001, "loss": 7.7109, "loss/crossentropy": 2.385079264640808, "loss/hidden": 3.046875, "loss/jsd": 0.0, "loss/logits": 0.22656168043613434, "step": 6284 }, { "epoch": 0.392875, "grad_norm": 2.5625, "grad_norm_var": 0.052994791666666666, "learning_rate": 0.0001, "loss": 7.7122, "loss/crossentropy": 2.515500783920288, "loss/hidden": 2.953125, "loss/jsd": 0.0, "loss/logits": 0.23578554391860962, "step": 6286 }, { "epoch": 0.393, "grad_norm": 2.5625, "grad_norm_var": 0.0533355712890625, "learning_rate": 0.0001, "loss": 7.5742, "loss/crossentropy": 2.1790542602539062, "loss/hidden": 2.96875, "loss/jsd": 0.0, "loss/logits": 0.22003192454576492, "step": 6288 }, { "epoch": 0.393125, "grad_norm": 2.796875, "grad_norm_var": 0.054320271809895834, "learning_rate": 0.0001, "loss": 7.6314, "loss/crossentropy": 2.25791597366333, "loss/hidden": 3.046875, "loss/jsd": 0.0, "loss/logits": 0.256175234913826, "step": 6290 }, { "epoch": 0.39325, "grad_norm": 2.546875, "grad_norm_var": 0.04920247395833333, "learning_rate": 0.0001, "loss": 7.4662, "loss/crossentropy": 2.1893142461776733, "loss/hidden": 2.96875, "loss/jsd": 0.0, "loss/logits": 0.23055332899093628, "step": 6292 }, { "epoch": 0.393375, "grad_norm": 2.453125, "grad_norm_var": 0.05178120930989583, "learning_rate": 0.0001, "loss": 7.5163, "loss/crossentropy": 2.321172833442688, "loss/hidden": 3.0703125, "loss/jsd": 0.0, "loss/logits": 0.23198582977056503, "step": 6294 }, { "epoch": 0.3935, "grad_norm": 2.609375, "grad_norm_var": 0.04810282389322917, "learning_rate": 0.0001, "loss": 7.4647, "loss/crossentropy": 2.320393919944763, "loss/hidden": 3.0078125, "loss/jsd": 0.0, "loss/logits": 0.2342214211821556, "step": 6296 }, { "epoch": 0.393625, "grad_norm": 2.578125, "grad_norm_var": 0.0509185791015625, "learning_rate": 0.0001, "loss": 7.4012, "loss/crossentropy": 2.328033685684204, "loss/hidden": 3.109375, "loss/jsd": 0.0, "loss/logits": 0.2649778574705124, "step": 6298 }, { "epoch": 0.39375, "grad_norm": 2.6875, "grad_norm_var": 0.017170206705729166, "learning_rate": 0.0001, "loss": 7.54, "loss/crossentropy": 2.3670496940612793, "loss/hidden": 3.0078125, "loss/jsd": 0.0, "loss/logits": 0.2493196278810501, "step": 6300 }, { "epoch": 0.393875, "grad_norm": 2.53125, "grad_norm_var": 0.018094889322916665, "learning_rate": 0.0001, "loss": 7.5042, "loss/crossentropy": 2.304697632789612, "loss/hidden": 3.046875, "loss/jsd": 0.0, "loss/logits": 0.22741590440273285, "step": 6302 }, { "epoch": 0.394, "grad_norm": 2.5625, "grad_norm_var": 0.018830362955729166, "learning_rate": 0.0001, "loss": 7.4154, "loss/crossentropy": 2.0195088982582092, "loss/hidden": 2.90625, "loss/jsd": 0.0, "loss/logits": 0.21824556589126587, "step": 6304 }, { "epoch": 0.394125, "grad_norm": 2.75, "grad_norm_var": 0.018163045247395832, "learning_rate": 0.0001, "loss": 7.6319, "loss/crossentropy": 2.2605401277542114, "loss/hidden": 2.90625, "loss/jsd": 0.0, "loss/logits": 0.23538069427013397, "step": 6306 }, { "epoch": 0.39425, "grad_norm": 2.5, "grad_norm_var": 0.01884765625, "learning_rate": 0.0001, "loss": 7.6752, "loss/crossentropy": 2.202141582965851, "loss/hidden": 2.984375, "loss/jsd": 0.0, "loss/logits": 0.22323383390903473, "step": 6308 }, { "epoch": 0.394375, "grad_norm": 2.546875, "grad_norm_var": 0.01881103515625, "learning_rate": 0.0001, "loss": 7.4471, "loss/crossentropy": 2.3312731981277466, "loss/hidden": 2.953125, "loss/jsd": 0.0, "loss/logits": 0.213093139231205, "step": 6310 }, { "epoch": 0.3945, "grad_norm": 2.625, "grad_norm_var": 0.018680826822916666, "learning_rate": 0.0001, "loss": 7.7756, "loss/crossentropy": 2.368720054626465, "loss/hidden": 3.0234375, "loss/jsd": 0.0, "loss/logits": 0.2570921257138252, "step": 6312 }, { "epoch": 0.394625, "grad_norm": 2.5, "grad_norm_var": 0.009309895833333333, "learning_rate": 0.0001, "loss": 7.6745, "loss/crossentropy": 2.3770391941070557, "loss/hidden": 2.9609375, "loss/jsd": 0.0, "loss/logits": 0.2239127680659294, "step": 6314 }, { "epoch": 0.39475, "grad_norm": 2.6875, "grad_norm_var": 0.010530598958333333, "learning_rate": 0.0001, "loss": 7.5489, "loss/crossentropy": 2.150133490562439, "loss/hidden": 3.015625, "loss/jsd": 0.0, "loss/logits": 0.22410466521978378, "step": 6316 }, { "epoch": 0.394875, "grad_norm": 2.671875, "grad_norm_var": 0.013109334309895833, "learning_rate": 0.0001, "loss": 7.7113, "loss/crossentropy": 2.416159451007843, "loss/hidden": 3.109375, "loss/jsd": 0.0, "loss/logits": 0.22610267996788025, "step": 6318 }, { "epoch": 0.395, "grad_norm": 2.515625, "grad_norm_var": 0.0137359619140625, "learning_rate": 0.0001, "loss": 7.358, "loss/crossentropy": 2.0061793327331543, "loss/hidden": 3.03125, "loss/jsd": 0.0, "loss/logits": 0.22514311224222183, "step": 6320 }, { "epoch": 0.395125, "grad_norm": 2.828125, "grad_norm_var": 0.015690104166666666, "learning_rate": 0.0001, "loss": 7.6503, "loss/crossentropy": 2.0940767526626587, "loss/hidden": 3.03125, "loss/jsd": 0.0, "loss/logits": 0.22432160377502441, "step": 6322 }, { "epoch": 0.39525, "grad_norm": 2.65625, "grad_norm_var": 0.016499837239583332, "learning_rate": 0.0001, "loss": 7.4605, "loss/crossentropy": 2.117647409439087, "loss/hidden": 2.953125, "loss/jsd": 0.0, "loss/logits": 0.2354767918586731, "step": 6324 }, { "epoch": 0.395375, "grad_norm": 2.828125, "grad_norm_var": 0.016803995768229166, "learning_rate": 0.0001, "loss": 7.6921, "loss/crossentropy": 2.4791641235351562, "loss/hidden": 3.078125, "loss/jsd": 0.0, "loss/logits": 0.24508479237556458, "step": 6326 }, { "epoch": 0.3955, "grad_norm": 2.546875, "grad_norm_var": 0.0174468994140625, "learning_rate": 0.0001, "loss": 7.6212, "loss/crossentropy": 2.451242685317993, "loss/hidden": 3.0390625, "loss/jsd": 0.0, "loss/logits": 0.26512502133846283, "step": 6328 }, { "epoch": 0.395625, "grad_norm": 2.625, "grad_norm_var": 0.01513671875, "learning_rate": 0.0001, "loss": 7.636, "loss/crossentropy": 2.418341040611267, "loss/hidden": 3.0859375, "loss/jsd": 0.0, "loss/logits": 0.2341959998011589, "step": 6330 }, { "epoch": 0.39575, "grad_norm": 2.6875, "grad_norm_var": 0.0133209228515625, "learning_rate": 0.0001, "loss": 7.5658, "loss/crossentropy": 2.1039319038391113, "loss/hidden": 3.0625, "loss/jsd": 0.0, "loss/logits": 0.25280052423477173, "step": 6332 }, { "epoch": 0.395875, "grad_norm": 2.640625, "grad_norm_var": 0.012972005208333333, "learning_rate": 0.0001, "loss": 7.6732, "loss/crossentropy": 2.3630772829055786, "loss/hidden": 2.9921875, "loss/jsd": 0.0, "loss/logits": 0.22950495779514313, "step": 6334 }, { "epoch": 0.396, "grad_norm": 2.5625, "grad_norm_var": 0.013044230143229167, "learning_rate": 0.0001, "loss": 7.6265, "loss/crossentropy": 2.342318892478943, "loss/hidden": 2.9921875, "loss/jsd": 0.0, "loss/logits": 0.23600289225578308, "step": 6336 }, { "epoch": 0.396125, "grad_norm": 2.78125, "grad_norm_var": 0.0139312744140625, "learning_rate": 0.0001, "loss": 7.6067, "loss/crossentropy": 2.426081895828247, "loss/hidden": 3.0, "loss/jsd": 0.0, "loss/logits": 0.23234248161315918, "step": 6338 }, { "epoch": 0.39625, "grad_norm": 2.46875, "grad_norm_var": 0.013916015625, "learning_rate": 0.0001, "loss": 7.4614, "loss/crossentropy": 2.2226654291152954, "loss/hidden": 2.875, "loss/jsd": 0.0, "loss/logits": 0.21805572509765625, "step": 6340 }, { "epoch": 0.396375, "grad_norm": 2.625, "grad_norm_var": 0.011091105143229167, "learning_rate": 0.0001, "loss": 7.653, "loss/crossentropy": 2.250340700149536, "loss/hidden": 2.890625, "loss/jsd": 0.0, "loss/logits": 0.2261437326669693, "step": 6342 }, { "epoch": 0.3965, "grad_norm": 2.5625, "grad_norm_var": 0.010871378580729167, "learning_rate": 0.0001, "loss": 7.8183, "loss/crossentropy": 2.4005974531173706, "loss/hidden": 3.046875, "loss/jsd": 0.0, "loss/logits": 0.23055587708950043, "step": 6344 }, { "epoch": 0.396625, "grad_norm": 2.546875, "grad_norm_var": 0.01070556640625, "learning_rate": 0.0001, "loss": 7.355, "loss/crossentropy": 2.0769636631011963, "loss/hidden": 3.0390625, "loss/jsd": 0.0, "loss/logits": 0.21554897725582123, "step": 6346 }, { "epoch": 0.39675, "grad_norm": 6.4375, "grad_norm_var": 0.951318359375, "learning_rate": 0.0001, "loss": 7.6142, "loss/crossentropy": 2.2895344495773315, "loss/hidden": 3.046875, "loss/jsd": 0.0, "loss/logits": 0.23039314895868301, "step": 6348 }, { "epoch": 0.396875, "grad_norm": 3.0, "grad_norm_var": 0.95660400390625, "learning_rate": 0.0001, "loss": 7.8202, "loss/crossentropy": 2.148575782775879, "loss/hidden": 3.0, "loss/jsd": 0.0, "loss/logits": 0.23695151507854462, "step": 6350 }, { "epoch": 0.397, "grad_norm": 2.65625, "grad_norm_var": 0.9508941650390625, "learning_rate": 0.0001, "loss": 7.5864, "loss/crossentropy": 2.141268014907837, "loss/hidden": 2.96875, "loss/jsd": 0.0, "loss/logits": 0.21862905472517014, "step": 6352 }, { "epoch": 0.397125, "grad_norm": 2.71875, "grad_norm_var": 0.9495402018229167, "learning_rate": 0.0001, "loss": 7.5587, "loss/crossentropy": 2.085936427116394, "loss/hidden": 2.9296875, "loss/jsd": 0.0, "loss/logits": 0.21454478800296783, "step": 6354 }, { "epoch": 0.39725, "grad_norm": 2.984375, "grad_norm_var": 0.935009765625, "learning_rate": 0.0001, "loss": 7.6014, "loss/crossentropy": 2.1236528158187866, "loss/hidden": 2.984375, "loss/jsd": 0.0, "loss/logits": 0.2477792203426361, "step": 6356 }, { "epoch": 0.397375, "grad_norm": 2.5, "grad_norm_var": 0.9444661458333333, "learning_rate": 0.0001, "loss": 7.4279, "loss/crossentropy": 2.0798850059509277, "loss/hidden": 2.96875, "loss/jsd": 0.0, "loss/logits": 0.1996997371315956, "step": 6358 }, { "epoch": 0.3975, "grad_norm": 2.546875, "grad_norm_var": 0.94322509765625, "learning_rate": 0.0001, "loss": 7.6882, "loss/crossentropy": 2.269622802734375, "loss/hidden": 2.9609375, "loss/jsd": 0.0, "loss/logits": 0.23176228255033493, "step": 6360 }, { "epoch": 0.397625, "grad_norm": 2.734375, "grad_norm_var": 0.9333241780598959, "learning_rate": 0.0001, "loss": 7.496, "loss/crossentropy": 2.3194905519485474, "loss/hidden": 2.9375, "loss/jsd": 0.0, "loss/logits": 0.2166258692741394, "step": 6362 }, { "epoch": 0.39775, "grad_norm": 2.765625, "grad_norm_var": 0.04990946451822917, "learning_rate": 0.0001, "loss": 7.7077, "loss/crossentropy": 2.3710728883743286, "loss/hidden": 3.078125, "loss/jsd": 0.0, "loss/logits": 0.24659988284111023, "step": 6364 }, { "epoch": 0.397875, "grad_norm": 2.46875, "grad_norm_var": 0.0197174072265625, "learning_rate": 0.0001, "loss": 7.3622, "loss/crossentropy": 2.0953763723373413, "loss/hidden": 2.953125, "loss/jsd": 0.0, "loss/logits": 0.19251661747694016, "step": 6366 }, { "epoch": 0.398, "grad_norm": 2.734375, "grad_norm_var": 0.0206695556640625, "learning_rate": 0.0001, "loss": 7.5453, "loss/crossentropy": 2.179394483566284, "loss/hidden": 3.015625, "loss/jsd": 0.0, "loss/logits": 0.24496418237686157, "step": 6368 }, { "epoch": 0.398125, "grad_norm": 2.578125, "grad_norm_var": 0.019917805989583332, "learning_rate": 0.0001, "loss": 7.6042, "loss/crossentropy": 2.2830837965011597, "loss/hidden": 2.953125, "loss/jsd": 0.0, "loss/logits": 0.2160351723432541, "step": 6370 }, { "epoch": 0.39825, "grad_norm": 2.640625, "grad_norm_var": 0.011572265625, "learning_rate": 0.0001, "loss": 7.8097, "loss/crossentropy": 2.4033570289611816, "loss/hidden": 3.046875, "loss/jsd": 0.0, "loss/logits": 0.2354504019021988, "step": 6372 }, { "epoch": 0.398375, "grad_norm": 2.75, "grad_norm_var": 0.009618123372395834, "learning_rate": 0.0001, "loss": 7.5356, "loss/crossentropy": 1.9328319430351257, "loss/hidden": 2.875, "loss/jsd": 0.0, "loss/logits": 0.21225574612617493, "step": 6374 }, { "epoch": 0.3985, "grad_norm": 2.46875, "grad_norm_var": 0.011034138997395833, "learning_rate": 0.0001, "loss": 7.5251, "loss/crossentropy": 2.5008562803268433, "loss/hidden": 3.078125, "loss/jsd": 0.0, "loss/logits": 0.23024940490722656, "step": 6376 }, { "epoch": 0.398625, "grad_norm": 2.65625, "grad_norm_var": 0.01041259765625, "learning_rate": 0.0001, "loss": 7.528, "loss/crossentropy": 2.1242050528526306, "loss/hidden": 3.046875, "loss/jsd": 0.0, "loss/logits": 0.21649522334337234, "step": 6378 }, { "epoch": 0.39875, "grad_norm": 2.546875, "grad_norm_var": 0.0082427978515625, "learning_rate": 0.0001, "loss": 7.6492, "loss/crossentropy": 2.1073070764541626, "loss/hidden": 2.9609375, "loss/jsd": 0.0, "loss/logits": 0.21627923846244812, "step": 6380 }, { "epoch": 0.398875, "grad_norm": 2.5625, "grad_norm_var": 0.00738525390625, "learning_rate": 0.0001, "loss": 7.7412, "loss/crossentropy": 2.4694966077804565, "loss/hidden": 2.921875, "loss/jsd": 0.0, "loss/logits": 0.2295985370874405, "step": 6382 }, { "epoch": 0.399, "grad_norm": 2.53125, "grad_norm_var": 0.005353800455729167, "learning_rate": 0.0001, "loss": 7.6005, "loss/crossentropy": 2.361127734184265, "loss/hidden": 3.015625, "loss/jsd": 0.0, "loss/logits": 0.2265416458249092, "step": 6384 }, { "epoch": 0.399125, "grad_norm": 3.109375, "grad_norm_var": 0.024104817708333334, "learning_rate": 0.0001, "loss": 7.5183, "loss/crossentropy": 2.186962842941284, "loss/hidden": 2.8984375, "loss/jsd": 0.0, "loss/logits": 0.21863600611686707, "step": 6386 }, { "epoch": 0.39925, "grad_norm": 2.6875, "grad_norm_var": 0.023729451497395835, "learning_rate": 0.0001, "loss": 7.4574, "loss/crossentropy": 2.4032983779907227, "loss/hidden": 2.9921875, "loss/jsd": 0.0, "loss/logits": 0.21744444221258163, "step": 6388 }, { "epoch": 0.399375, "grad_norm": 2.71875, "grad_norm_var": 0.023265584309895834, "learning_rate": 0.0001, "loss": 7.8294, "loss/crossentropy": 2.5293025970458984, "loss/hidden": 3.03125, "loss/jsd": 0.0, "loss/logits": 0.2249559760093689, "step": 6390 }, { "epoch": 0.3995, "grad_norm": 2.5625, "grad_norm_var": 0.021891276041666668, "learning_rate": 0.0001, "loss": 7.4913, "loss/crossentropy": 2.0256664752960205, "loss/hidden": 3.09375, "loss/jsd": 0.0, "loss/logits": 0.21668323129415512, "step": 6392 }, { "epoch": 0.399625, "grad_norm": 2.796875, "grad_norm_var": 0.024193318684895833, "learning_rate": 0.0001, "loss": 7.6356, "loss/crossentropy": 2.1840275526046753, "loss/hidden": 2.8984375, "loss/jsd": 0.0, "loss/logits": 0.21782296150922775, "step": 6394 }, { "epoch": 0.39975, "grad_norm": 2.609375, "grad_norm_var": 0.023828125, "learning_rate": 0.0001, "loss": 7.5591, "loss/crossentropy": 2.3117536306381226, "loss/hidden": 2.9453125, "loss/jsd": 0.0, "loss/logits": 0.2328423708677292, "step": 6396 }, { "epoch": 0.399875, "grad_norm": 2.40625, "grad_norm_var": 0.026302083333333334, "learning_rate": 0.0001, "loss": 7.4801, "loss/crossentropy": 1.9110868573188782, "loss/hidden": 2.9375, "loss/jsd": 0.0, "loss/logits": 0.22831560671329498, "step": 6398 }, { "epoch": 0.4, "grad_norm": 2.5625, "grad_norm_var": 0.0258453369140625, "learning_rate": 0.0001, "loss": 7.746, "loss/crossentropy": 2.3806575536727905, "loss/hidden": 3.0234375, "loss/jsd": 0.0, "loss/logits": 0.23545508086681366, "step": 6400 }, { "epoch": 0.400125, "grad_norm": 2.640625, "grad_norm_var": 0.010026041666666667, "learning_rate": 0.0001, "loss": 7.4999, "loss/crossentropy": 2.3991007804870605, "loss/hidden": 2.96875, "loss/jsd": 0.0, "loss/logits": 0.23664527386426926, "step": 6402 }, { "epoch": 0.40025, "grad_norm": 2.65625, "grad_norm_var": 0.010277303059895833, "learning_rate": 0.0001, "loss": 7.6897, "loss/crossentropy": 2.2968008518218994, "loss/hidden": 2.984375, "loss/jsd": 0.0, "loss/logits": 0.217793270945549, "step": 6404 }, { "epoch": 0.400375, "grad_norm": 2.640625, "grad_norm_var": 0.00963134765625, "learning_rate": 0.0001, "loss": 7.4137, "loss/crossentropy": 2.204868793487549, "loss/hidden": 3.046875, "loss/jsd": 0.0, "loss/logits": 0.2332460805773735, "step": 6406 }, { "epoch": 0.4005, "grad_norm": 2.765625, "grad_norm_var": 0.011579386393229167, "learning_rate": 0.0001, "loss": 7.5019, "loss/crossentropy": 2.0902740359306335, "loss/hidden": 2.9453125, "loss/jsd": 0.0, "loss/logits": 0.2305055856704712, "step": 6408 }, { "epoch": 0.400625, "grad_norm": 2.65625, "grad_norm_var": 0.00986328125, "learning_rate": 0.0001, "loss": 7.7726, "loss/crossentropy": 2.400550127029419, "loss/hidden": 3.078125, "loss/jsd": 0.0, "loss/logits": 0.2618885189294815, "step": 6410 }, { "epoch": 0.40075, "grad_norm": 2.59375, "grad_norm_var": 0.0096343994140625, "learning_rate": 0.0001, "loss": 7.7652, "loss/crossentropy": 2.4428645372390747, "loss/hidden": 3.046875, "loss/jsd": 0.0, "loss/logits": 0.24537986516952515, "step": 6412 }, { "epoch": 0.400875, "grad_norm": 2.75, "grad_norm_var": 0.00826416015625, "learning_rate": 0.0001, "loss": 7.5609, "loss/crossentropy": 2.4830206632614136, "loss/hidden": 3.1171875, "loss/jsd": 0.0, "loss/logits": 0.2565157115459442, "step": 6414 }, { "epoch": 0.401, "grad_norm": 2.65625, "grad_norm_var": 0.0080230712890625, "learning_rate": 0.0001, "loss": 7.7025, "loss/crossentropy": 2.400224804878235, "loss/hidden": 3.0390625, "loss/jsd": 0.0, "loss/logits": 0.24681121110916138, "step": 6416 }, { "epoch": 0.401125, "grad_norm": 2.625, "grad_norm_var": 0.00640869140625, "learning_rate": 0.0001, "loss": 7.3575, "loss/crossentropy": 2.2659554481506348, "loss/hidden": 2.9296875, "loss/jsd": 0.0, "loss/logits": 0.2184985727071762, "step": 6418 }, { "epoch": 0.40125, "grad_norm": 2.484375, "grad_norm_var": 0.0075266520182291664, "learning_rate": 0.0001, "loss": 7.5334, "loss/crossentropy": 2.0715479850769043, "loss/hidden": 2.890625, "loss/jsd": 0.0, "loss/logits": 0.21770843863487244, "step": 6420 }, { "epoch": 0.401375, "grad_norm": 2.703125, "grad_norm_var": 0.007840983072916667, "learning_rate": 0.0001, "loss": 7.565, "loss/crossentropy": 2.072678565979004, "loss/hidden": 2.9375, "loss/jsd": 0.0, "loss/logits": 0.21133434772491455, "step": 6422 }, { "epoch": 0.4015, "grad_norm": 2.6875, "grad_norm_var": 0.0073638916015625, "learning_rate": 0.0001, "loss": 7.6415, "loss/crossentropy": 2.2100725769996643, "loss/hidden": 2.9765625, "loss/jsd": 0.0, "loss/logits": 0.2077169343829155, "step": 6424 }, { "epoch": 0.401625, "grad_norm": 2.796875, "grad_norm_var": 0.010081990559895834, "learning_rate": 0.0001, "loss": 7.4601, "loss/crossentropy": 2.211206555366516, "loss/hidden": 2.9609375, "loss/jsd": 0.0, "loss/logits": 0.22444123029708862, "step": 6426 }, { "epoch": 0.40175, "grad_norm": 2.546875, "grad_norm_var": 0.010358683268229167, "learning_rate": 0.0001, "loss": 7.483, "loss/crossentropy": 2.360229730606079, "loss/hidden": 3.046875, "loss/jsd": 0.0, "loss/logits": 0.22969259321689606, "step": 6428 }, { "epoch": 0.401875, "grad_norm": 2.71875, "grad_norm_var": 0.008870442708333334, "learning_rate": 0.0001, "loss": 7.7357, "loss/crossentropy": 2.459460973739624, "loss/hidden": 3.046875, "loss/jsd": 0.0, "loss/logits": 0.22635357826948166, "step": 6430 }, { "epoch": 0.402, "grad_norm": 2.546875, "grad_norm_var": 0.010480753580729167, "learning_rate": 0.0001, "loss": 7.5855, "loss/crossentropy": 2.229428768157959, "loss/hidden": 2.984375, "loss/jsd": 0.0, "loss/logits": 0.23950792104005814, "step": 6432 }, { "epoch": 0.402125, "grad_norm": 2.546875, "grad_norm_var": 0.0113433837890625, "learning_rate": 0.0001, "loss": 7.7009, "loss/crossentropy": 2.4964840412139893, "loss/hidden": 2.984375, "loss/jsd": 0.0, "loss/logits": 0.24381062388420105, "step": 6434 }, { "epoch": 0.40225, "grad_norm": 2.484375, "grad_norm_var": 0.0106597900390625, "learning_rate": 0.0001, "loss": 7.5846, "loss/crossentropy": 2.4857131242752075, "loss/hidden": 2.9453125, "loss/jsd": 0.0, "loss/logits": 0.22634105384349823, "step": 6436 }, { "epoch": 0.402375, "grad_norm": 2.78125, "grad_norm_var": 0.011847941080729167, "learning_rate": 0.0001, "loss": 7.7486, "loss/crossentropy": 2.3042640686035156, "loss/hidden": 3.078125, "loss/jsd": 0.0, "loss/logits": 0.2516568899154663, "step": 6438 }, { "epoch": 0.4025, "grad_norm": 2.59375, "grad_norm_var": 0.013630167643229166, "learning_rate": 0.0001, "loss": 7.5613, "loss/crossentropy": 2.2769718170166016, "loss/hidden": 2.9140625, "loss/jsd": 0.0, "loss/logits": 0.22594355791807175, "step": 6440 }, { "epoch": 0.402625, "grad_norm": 5.65625, "grad_norm_var": 0.58092041015625, "learning_rate": 0.0001, "loss": 7.6713, "loss/crossentropy": 2.3272628784179688, "loss/hidden": 3.0546875, "loss/jsd": 0.0, "loss/logits": 0.2554403692483902, "step": 6442 }, { "epoch": 0.40275, "grad_norm": 2.765625, "grad_norm_var": 0.5735585530598958, "learning_rate": 0.0001, "loss": 7.5097, "loss/crossentropy": 2.272169828414917, "loss/hidden": 3.046875, "loss/jsd": 0.0, "loss/logits": 0.24848252534866333, "step": 6444 }, { "epoch": 0.402875, "grad_norm": 2.703125, "grad_norm_var": 0.5789621988932292, "learning_rate": 0.0001, "loss": 7.4988, "loss/crossentropy": 2.169219136238098, "loss/hidden": 3.0, "loss/jsd": 0.0, "loss/logits": 0.2193443328142166, "step": 6446 }, { "epoch": 0.403, "grad_norm": 11.1875, "grad_norm_var": 4.9101308186848955, "learning_rate": 0.0001, "loss": 7.9161, "loss/crossentropy": 2.0119059681892395, "loss/hidden": 2.984375, "loss/jsd": 0.0, "loss/logits": 0.23552437126636505, "step": 6448 }, { "epoch": 0.403125, "grad_norm": 2.921875, "grad_norm_var": 4.852978515625, "learning_rate": 0.0001, "loss": 7.74, "loss/crossentropy": 2.2246367931365967, "loss/hidden": 3.1015625, "loss/jsd": 0.0, "loss/logits": 0.2612927258014679, "step": 6450 }, { "epoch": 0.40325, "grad_norm": 2.6875, "grad_norm_var": 4.813011678059896, "learning_rate": 0.0001, "loss": 7.7199, "loss/crossentropy": 2.3895065784454346, "loss/hidden": 3.1015625, "loss/jsd": 0.0, "loss/logits": 0.2671157121658325, "step": 6452 }, { "epoch": 0.403375, "grad_norm": 2.578125, "grad_norm_var": 4.865664672851563, "learning_rate": 0.0001, "loss": 7.5549, "loss/crossentropy": 2.342351198196411, "loss/hidden": 2.9765625, "loss/jsd": 0.0, "loss/logits": 0.228829026222229, "step": 6454 }, { "epoch": 0.4035, "grad_norm": 2.6875, "grad_norm_var": 4.88873291015625, "learning_rate": 0.0001, "loss": 7.3488, "loss/crossentropy": 2.082708179950714, "loss/hidden": 2.953125, "loss/jsd": 0.0, "loss/logits": 0.20652340352535248, "step": 6456 }, { "epoch": 0.403625, "grad_norm": 2.59375, "grad_norm_var": 4.548322550455729, "learning_rate": 0.0001, "loss": 7.7046, "loss/crossentropy": 2.082743525505066, "loss/hidden": 2.921875, "loss/jsd": 0.0, "loss/logits": 0.22296911478042603, "step": 6458 }, { "epoch": 0.40375, "grad_norm": 2.609375, "grad_norm_var": 4.567096964518229, "learning_rate": 0.0001, "loss": 7.6053, "loss/crossentropy": 2.164341628551483, "loss/hidden": 2.890625, "loss/jsd": 0.0, "loss/logits": 0.19376349449157715, "step": 6460 }, { "epoch": 0.403875, "grad_norm": 2.671875, "grad_norm_var": 4.55084228515625, "learning_rate": 0.0001, "loss": 7.5639, "loss/crossentropy": 2.3109676837921143, "loss/hidden": 3.0703125, "loss/jsd": 0.0, "loss/logits": 0.2219085544347763, "step": 6462 }, { "epoch": 0.404, "grad_norm": 2.65625, "grad_norm_var": 0.0313385009765625, "learning_rate": 0.0001, "loss": 7.5475, "loss/crossentropy": 2.282227873802185, "loss/hidden": 2.9375, "loss/jsd": 0.0, "loss/logits": 0.22183822095394135, "step": 6464 }, { "epoch": 0.404125, "grad_norm": 2.609375, "grad_norm_var": 0.00963134765625, "learning_rate": 0.0001, "loss": 7.5599, "loss/crossentropy": 2.1522287130355835, "loss/hidden": 2.9140625, "loss/jsd": 0.0, "loss/logits": 0.221329964697361, "step": 6466 }, { "epoch": 0.40425, "grad_norm": 2.703125, "grad_norm_var": 0.00963134765625, "learning_rate": 0.0001, "loss": 7.8553, "loss/crossentropy": 2.567604184150696, "loss/hidden": 3.078125, "loss/jsd": 0.0, "loss/logits": 0.24820351600646973, "step": 6468 }, { "epoch": 0.404375, "grad_norm": 2.53125, "grad_norm_var": 0.009137980143229167, "learning_rate": 0.0001, "loss": 7.616, "loss/crossentropy": 2.3556437492370605, "loss/hidden": 3.015625, "loss/jsd": 0.0, "loss/logits": 0.2341202348470688, "step": 6470 }, { "epoch": 0.4045, "grad_norm": 2.71875, "grad_norm_var": 0.010090128580729166, "learning_rate": 0.0001, "loss": 7.6643, "loss/crossentropy": 2.188850998878479, "loss/hidden": 2.9765625, "loss/jsd": 0.0, "loss/logits": 0.22609790414571762, "step": 6472 }, { "epoch": 0.404625, "grad_norm": 2.75, "grad_norm_var": 0.011844889322916666, "learning_rate": 0.0001, "loss": 7.5213, "loss/crossentropy": 2.2338500022888184, "loss/hidden": 2.9765625, "loss/jsd": 0.0, "loss/logits": 0.24597671627998352, "step": 6474 }, { "epoch": 0.40475, "grad_norm": 2.53125, "grad_norm_var": 0.012516276041666666, "learning_rate": 0.0001, "loss": 7.5833, "loss/crossentropy": 2.4775822162628174, "loss/hidden": 2.9453125, "loss/jsd": 0.0, "loss/logits": 0.23304374516010284, "step": 6476 }, { "epoch": 0.404875, "grad_norm": 2.765625, "grad_norm_var": 0.015973917643229165, "learning_rate": 0.0001, "loss": 7.6762, "loss/crossentropy": 2.37015438079834, "loss/hidden": 2.9921875, "loss/jsd": 0.0, "loss/logits": 0.22350409626960754, "step": 6478 }, { "epoch": 0.405, "grad_norm": 2.84375, "grad_norm_var": 0.020173136393229166, "learning_rate": 0.0001, "loss": 7.7956, "loss/crossentropy": 2.1530754566192627, "loss/hidden": 3.015625, "loss/jsd": 0.0, "loss/logits": 0.23307234048843384, "step": 6480 }, { "epoch": 0.405125, "grad_norm": 2.5625, "grad_norm_var": 0.020441691080729168, "learning_rate": 0.0001, "loss": 7.7633, "loss/crossentropy": 2.6825673580169678, "loss/hidden": 2.9921875, "loss/jsd": 0.0, "loss/logits": 0.24624750763177872, "step": 6482 }, { "epoch": 0.40525, "grad_norm": 2.59375, "grad_norm_var": 0.020231119791666665, "learning_rate": 0.0001, "loss": 7.4004, "loss/crossentropy": 2.236609697341919, "loss/hidden": 3.0390625, "loss/jsd": 0.0, "loss/logits": 0.2373785674571991, "step": 6484 }, { "epoch": 0.405375, "grad_norm": 2.609375, "grad_norm_var": 0.016218058268229165, "learning_rate": 0.0001, "loss": 7.5584, "loss/crossentropy": 2.2639390230178833, "loss/hidden": 2.96875, "loss/jsd": 0.0, "loss/logits": 0.2137163206934929, "step": 6486 }, { "epoch": 0.4055, "grad_norm": 2.609375, "grad_norm_var": 0.014839680989583333, "learning_rate": 0.0001, "loss": 7.5184, "loss/crossentropy": 2.4059932231903076, "loss/hidden": 2.984375, "loss/jsd": 0.0, "loss/logits": 0.2348843514919281, "step": 6488 }, { "epoch": 0.405625, "grad_norm": 2.453125, "grad_norm_var": 0.014891560872395833, "learning_rate": 0.0001, "loss": 7.6856, "loss/crossentropy": 2.390839695930481, "loss/hidden": 3.03125, "loss/jsd": 0.0, "loss/logits": 0.23600734025239944, "step": 6490 }, { "epoch": 0.40575, "grad_norm": 2.609375, "grad_norm_var": 0.015299479166666666, "learning_rate": 0.0001, "loss": 7.5672, "loss/crossentropy": 2.4925975799560547, "loss/hidden": 2.9765625, "loss/jsd": 0.0, "loss/logits": 0.2324211597442627, "step": 6492 }, { "epoch": 0.405875, "grad_norm": 2.484375, "grad_norm_var": 0.013004557291666666, "learning_rate": 0.0001, "loss": 7.3634, "loss/crossentropy": 2.254358649253845, "loss/hidden": 2.90625, "loss/jsd": 0.0, "loss/logits": 0.22002553939819336, "step": 6494 }, { "epoch": 0.406, "grad_norm": 2.484375, "grad_norm_var": 0.0051910400390625, "learning_rate": 0.0001, "loss": 7.561, "loss/crossentropy": 1.944933831691742, "loss/hidden": 2.9921875, "loss/jsd": 0.0, "loss/logits": 0.22056113183498383, "step": 6496 }, { "epoch": 0.406125, "grad_norm": 2.6875, "grad_norm_var": 0.004930623372395833, "learning_rate": 0.0001, "loss": 7.7594, "loss/crossentropy": 2.30185067653656, "loss/hidden": 3.0703125, "loss/jsd": 0.0, "loss/logits": 0.23535191267728806, "step": 6498 }, { "epoch": 0.40625, "grad_norm": 2.609375, "grad_norm_var": 0.0050933837890625, "learning_rate": 0.0001, "loss": 7.7826, "loss/crossentropy": 2.468358635902405, "loss/hidden": 3.0859375, "loss/jsd": 0.0, "loss/logits": 0.25658316165208817, "step": 6500 }, { "epoch": 0.406375, "grad_norm": 2.515625, "grad_norm_var": 0.005204264322916667, "learning_rate": 0.0001, "loss": 7.4793, "loss/crossentropy": 2.2335681915283203, "loss/hidden": 2.9296875, "loss/jsd": 0.0, "loss/logits": 0.23286845535039902, "step": 6502 }, { "epoch": 0.4065, "grad_norm": 2.546875, "grad_norm_var": 0.006136067708333333, "learning_rate": 0.0001, "loss": 7.4858, "loss/crossentropy": 2.3515396118164062, "loss/hidden": 2.9140625, "loss/jsd": 0.0, "loss/logits": 0.23640403896570206, "step": 6504 }, { "epoch": 0.406625, "grad_norm": 2.578125, "grad_norm_var": 0.0144195556640625, "learning_rate": 0.0001, "loss": 7.5546, "loss/crossentropy": 2.2726809978485107, "loss/hidden": 3.015625, "loss/jsd": 0.0, "loss/logits": 0.2327079027891159, "step": 6506 }, { "epoch": 0.40675, "grad_norm": 2.5, "grad_norm_var": 0.01861572265625, "learning_rate": 0.0001, "loss": 7.6823, "loss/crossentropy": 2.145686626434326, "loss/hidden": 3.015625, "loss/jsd": 0.0, "loss/logits": 0.22933802753686905, "step": 6508 }, { "epoch": 0.406875, "grad_norm": 2.59375, "grad_norm_var": 0.0168853759765625, "learning_rate": 0.0001, "loss": 7.7906, "loss/crossentropy": 2.5454604625701904, "loss/hidden": 3.0546875, "loss/jsd": 0.0, "loss/logits": 0.22263063490390778, "step": 6510 }, { "epoch": 0.407, "grad_norm": 2.546875, "grad_norm_var": 0.018017578125, "learning_rate": 0.0001, "loss": 7.5031, "loss/crossentropy": 2.1542991399765015, "loss/hidden": 2.90625, "loss/jsd": 0.0, "loss/logits": 0.21461249142885208, "step": 6512 }, { "epoch": 0.407125, "grad_norm": 2.828125, "grad_norm_var": 0.020702107747395834, "learning_rate": 0.0001, "loss": 7.6144, "loss/crossentropy": 2.316395878791809, "loss/hidden": 3.078125, "loss/jsd": 0.0, "loss/logits": 0.23294207453727722, "step": 6514 }, { "epoch": 0.40725, "grad_norm": 2.703125, "grad_norm_var": 0.0212554931640625, "learning_rate": 0.0001, "loss": 7.5582, "loss/crossentropy": 2.303479313850403, "loss/hidden": 2.96875, "loss/jsd": 0.0, "loss/logits": 0.1987205669283867, "step": 6516 }, { "epoch": 0.407375, "grad_norm": 2.640625, "grad_norm_var": 0.02086181640625, "learning_rate": 0.0001, "loss": 7.457, "loss/crossentropy": 2.2391674518585205, "loss/hidden": 2.96875, "loss/jsd": 0.0, "loss/logits": 0.22896330058574677, "step": 6518 }, { "epoch": 0.4075, "grad_norm": 2.609375, "grad_norm_var": 0.023388671875, "learning_rate": 0.0001, "loss": 7.5562, "loss/crossentropy": 2.103419542312622, "loss/hidden": 2.90625, "loss/jsd": 0.0, "loss/logits": 0.21157852560281754, "step": 6520 }, { "epoch": 0.407625, "grad_norm": 2.796875, "grad_norm_var": 0.0190826416015625, "learning_rate": 0.0001, "loss": 7.5634, "loss/crossentropy": 2.2614264488220215, "loss/hidden": 2.9765625, "loss/jsd": 0.0, "loss/logits": 0.21028782427310944, "step": 6522 }, { "epoch": 0.40775, "grad_norm": 2.8125, "grad_norm_var": 0.017350260416666666, "learning_rate": 0.0001, "loss": 7.6181, "loss/crossentropy": 2.25162136554718, "loss/hidden": 3.078125, "loss/jsd": 0.0, "loss/logits": 0.23185992240905762, "step": 6524 }, { "epoch": 0.407875, "grad_norm": 2.609375, "grad_norm_var": 0.017671712239583335, "learning_rate": 0.0001, "loss": 7.5725, "loss/crossentropy": 2.249953508377075, "loss/hidden": 3.09375, "loss/jsd": 0.0, "loss/logits": 0.23878604173660278, "step": 6526 }, { "epoch": 0.408, "grad_norm": 2.5625, "grad_norm_var": 0.015104166666666667, "learning_rate": 0.0001, "loss": 7.4301, "loss/crossentropy": 2.099335551261902, "loss/hidden": 2.9140625, "loss/jsd": 0.0, "loss/logits": 0.21443204581737518, "step": 6528 }, { "epoch": 0.408125, "grad_norm": 2.5625, "grad_norm_var": 0.013407389322916666, "learning_rate": 0.0001, "loss": 7.4584, "loss/crossentropy": 2.290423274040222, "loss/hidden": 3.0390625, "loss/jsd": 0.0, "loss/logits": 0.24671924859285355, "step": 6530 }, { "epoch": 0.40825, "grad_norm": 2.65625, "grad_norm_var": 0.012955729166666667, "learning_rate": 0.0001, "loss": 7.5475, "loss/crossentropy": 2.3274335861206055, "loss/hidden": 3.078125, "loss/jsd": 0.0, "loss/logits": 0.24327465891838074, "step": 6532 }, { "epoch": 0.408375, "grad_norm": 3.03125, "grad_norm_var": 0.02310791015625, "learning_rate": 0.0001, "loss": 7.4913, "loss/crossentropy": 2.2215962409973145, "loss/hidden": 3.0625, "loss/jsd": 0.0, "loss/logits": 0.2649707570672035, "step": 6534 }, { "epoch": 0.4085, "grad_norm": 2.703125, "grad_norm_var": 0.031981404622395834, "learning_rate": 0.0001, "loss": 7.5753, "loss/crossentropy": 2.2809780836105347, "loss/hidden": 2.9921875, "loss/jsd": 0.0, "loss/logits": 0.2313680276274681, "step": 6536 }, { "epoch": 0.408625, "grad_norm": 2.71875, "grad_norm_var": 0.031396484375, "learning_rate": 0.0001, "loss": 7.8391, "loss/crossentropy": 2.380458950996399, "loss/hidden": 3.015625, "loss/jsd": 0.0, "loss/logits": 0.22678998112678528, "step": 6538 }, { "epoch": 0.40875, "grad_norm": 2.53125, "grad_norm_var": 0.03434956868489583, "learning_rate": 0.0001, "loss": 7.3035, "loss/crossentropy": 1.896793246269226, "loss/hidden": 2.90625, "loss/jsd": 0.0, "loss/logits": 0.18995589762926102, "step": 6540 }, { "epoch": 0.408875, "grad_norm": 2.671875, "grad_norm_var": 0.03417561848958333, "learning_rate": 0.0001, "loss": 7.5149, "loss/crossentropy": 2.2268275022506714, "loss/hidden": 2.984375, "loss/jsd": 0.0, "loss/logits": 0.2373184710741043, "step": 6542 }, { "epoch": 0.409, "grad_norm": 2.71875, "grad_norm_var": 0.046141560872395834, "learning_rate": 0.0001, "loss": 7.527, "loss/crossentropy": 2.4108771085739136, "loss/hidden": 3.1015625, "loss/jsd": 0.0, "loss/logits": 0.24322284758090973, "step": 6544 }, { "epoch": 0.409125, "grad_norm": 3.84375, "grad_norm_var": 0.11757405598958333, "learning_rate": 0.0001, "loss": 7.616, "loss/crossentropy": 2.3702893257141113, "loss/hidden": 3.015625, "loss/jsd": 0.0, "loss/logits": 0.23530493676662445, "step": 6546 }, { "epoch": 0.40925, "grad_norm": 2.59375, "grad_norm_var": 0.12656962076822917, "learning_rate": 0.0001, "loss": 7.5812, "loss/crossentropy": 2.23604679107666, "loss/hidden": 3.03125, "loss/jsd": 0.0, "loss/logits": 0.2326735109090805, "step": 6548 }, { "epoch": 0.409375, "grad_norm": 2.546875, "grad_norm_var": 0.13168843587239584, "learning_rate": 0.0001, "loss": 7.6394, "loss/crossentropy": 2.300023674964905, "loss/hidden": 3.0703125, "loss/jsd": 0.0, "loss/logits": 0.24932373315095901, "step": 6550 }, { "epoch": 0.4095, "grad_norm": 2.453125, "grad_norm_var": 0.12976786295572917, "learning_rate": 0.0001, "loss": 7.4564, "loss/crossentropy": 2.1024811267852783, "loss/hidden": 2.9375, "loss/jsd": 0.0, "loss/logits": 0.20963046699762344, "step": 6552 }, { "epoch": 0.409625, "grad_norm": 2.75, "grad_norm_var": 0.12984619140625, "learning_rate": 0.0001, "loss": 7.7796, "loss/crossentropy": 2.1907604932785034, "loss/hidden": 3.1328125, "loss/jsd": 0.0, "loss/logits": 0.2528301328420639, "step": 6554 }, { "epoch": 0.40975, "grad_norm": 2.78125, "grad_norm_var": 0.11998291015625, "learning_rate": 0.0001, "loss": 7.5188, "loss/crossentropy": 2.244860053062439, "loss/hidden": 2.9453125, "loss/jsd": 0.0, "loss/logits": 0.23077642172574997, "step": 6556 }, { "epoch": 0.409875, "grad_norm": 2.65625, "grad_norm_var": 0.11931966145833334, "learning_rate": 0.0001, "loss": 7.4776, "loss/crossentropy": 2.0770727396011353, "loss/hidden": 3.03125, "loss/jsd": 0.0, "loss/logits": 0.22512366622686386, "step": 6558 }, { "epoch": 0.41, "grad_norm": 2.609375, "grad_norm_var": 0.11503499348958333, "learning_rate": 0.0001, "loss": 7.5796, "loss/crossentropy": 2.2558538913726807, "loss/hidden": 3.03125, "loss/jsd": 0.0, "loss/logits": 0.22963982075452805, "step": 6560 }, { "epoch": 0.410125, "grad_norm": 2.609375, "grad_norm_var": 0.03134358723958333, "learning_rate": 0.0001, "loss": 7.6013, "loss/crossentropy": 2.3383020162582397, "loss/hidden": 2.9765625, "loss/jsd": 0.0, "loss/logits": 0.2459249421954155, "step": 6562 }, { "epoch": 0.41025, "grad_norm": 2.71875, "grad_norm_var": 0.009422810872395833, "learning_rate": 0.0001, "loss": 7.7781, "loss/crossentropy": 2.4201927185058594, "loss/hidden": 2.9921875, "loss/jsd": 0.0, "loss/logits": 0.2378387302160263, "step": 6564 }, { "epoch": 0.410375, "grad_norm": 2.546875, "grad_norm_var": 0.011156209309895833, "learning_rate": 0.0001, "loss": 7.4348, "loss/crossentropy": 2.4735056161880493, "loss/hidden": 3.03125, "loss/jsd": 0.0, "loss/logits": 0.23408573120832443, "step": 6566 }, { "epoch": 0.4105, "grad_norm": 2.46875, "grad_norm_var": 0.01441650390625, "learning_rate": 0.0001, "loss": 7.4129, "loss/crossentropy": 2.2188019156455994, "loss/hidden": 2.8828125, "loss/jsd": 0.0, "loss/logits": 0.22182120382785797, "step": 6568 }, { "epoch": 0.410625, "grad_norm": 3.0, "grad_norm_var": 0.020145670572916666, "learning_rate": 0.0001, "loss": 7.4417, "loss/crossentropy": 2.3368951082229614, "loss/hidden": 2.9609375, "loss/jsd": 0.0, "loss/logits": 0.23602771759033203, "step": 6570 }, { "epoch": 0.41075, "grad_norm": 2.796875, "grad_norm_var": 0.02086181640625, "learning_rate": 0.0001, "loss": 7.6761, "loss/crossentropy": 2.4886250495910645, "loss/hidden": 2.9609375, "loss/jsd": 0.0, "loss/logits": 0.23062077164649963, "step": 6572 }, { "epoch": 0.410875, "grad_norm": 2.546875, "grad_norm_var": 0.021317545572916666, "learning_rate": 0.0001, "loss": 7.5844, "loss/crossentropy": 2.0759971141815186, "loss/hidden": 2.984375, "loss/jsd": 0.0, "loss/logits": 0.23338718712329865, "step": 6574 }, { "epoch": 0.411, "grad_norm": 2.640625, "grad_norm_var": 0.021222941080729165, "learning_rate": 0.0001, "loss": 7.665, "loss/crossentropy": 2.4548429250717163, "loss/hidden": 3.0546875, "loss/jsd": 0.0, "loss/logits": 0.2538658156991005, "step": 6576 }, { "epoch": 0.411125, "grad_norm": 2.40625, "grad_norm_var": 0.024421183268229167, "learning_rate": 0.0001, "loss": 7.5296, "loss/crossentropy": 2.1038975715637207, "loss/hidden": 2.96875, "loss/jsd": 0.0, "loss/logits": 0.21533842384815216, "step": 6578 }, { "epoch": 0.41125, "grad_norm": 2.59375, "grad_norm_var": 0.024421183268229167, "learning_rate": 0.0001, "loss": 7.7126, "loss/crossentropy": 2.441248655319214, "loss/hidden": 3.0859375, "loss/jsd": 0.0, "loss/logits": 0.25519800186157227, "step": 6580 }, { "epoch": 0.411375, "grad_norm": 2.40625, "grad_norm_var": 0.025651041666666666, "learning_rate": 0.0001, "loss": 7.5036, "loss/crossentropy": 2.4303916692733765, "loss/hidden": 3.109375, "loss/jsd": 0.0, "loss/logits": 0.2536260038614273, "step": 6582 }, { "epoch": 0.4115, "grad_norm": 2.625, "grad_norm_var": 0.020946248372395834, "learning_rate": 0.0001, "loss": 7.5953, "loss/crossentropy": 2.196594476699829, "loss/hidden": 3.09375, "loss/jsd": 0.0, "loss/logits": 0.23726627230644226, "step": 6584 }, { "epoch": 0.411625, "grad_norm": 2.546875, "grad_norm_var": 0.024583943684895835, "learning_rate": 0.0001, "loss": 7.444, "loss/crossentropy": 2.2186540365219116, "loss/hidden": 3.078125, "loss/jsd": 0.0, "loss/logits": 0.24524381756782532, "step": 6586 }, { "epoch": 0.41175, "grad_norm": 2.859375, "grad_norm_var": 0.0258941650390625, "learning_rate": 0.0001, "loss": 7.7625, "loss/crossentropy": 2.324312210083008, "loss/hidden": 3.046875, "loss/jsd": 0.0, "loss/logits": 0.25367560237646103, "step": 6588 }, { "epoch": 0.411875, "grad_norm": 2.53125, "grad_norm_var": 0.02607421875, "learning_rate": 0.0001, "loss": 7.6039, "loss/crossentropy": 2.256448745727539, "loss/hidden": 2.9296875, "loss/jsd": 0.0, "loss/logits": 0.22934693098068237, "step": 6590 }, { "epoch": 0.412, "grad_norm": 2.59375, "grad_norm_var": 0.0262359619140625, "learning_rate": 0.0001, "loss": 7.5264, "loss/crossentropy": 2.296054482460022, "loss/hidden": 2.890625, "loss/jsd": 0.0, "loss/logits": 0.2321232706308365, "step": 6592 }, { "epoch": 0.412125, "grad_norm": 2.8125, "grad_norm_var": 0.024674479166666666, "learning_rate": 0.0001, "loss": 7.8346, "loss/crossentropy": 2.4963600635528564, "loss/hidden": 3.046875, "loss/jsd": 0.0, "loss/logits": 0.23575764149427414, "step": 6594 }, { "epoch": 0.41225, "grad_norm": 2.796875, "grad_norm_var": 0.026790364583333334, "learning_rate": 0.0001, "loss": 7.6727, "loss/crossentropy": 2.213708281517029, "loss/hidden": 2.9765625, "loss/jsd": 0.0, "loss/logits": 0.23489990830421448, "step": 6596 }, { "epoch": 0.412375, "grad_norm": 2.5625, "grad_norm_var": 0.023583984375, "learning_rate": 0.0001, "loss": 7.4344, "loss/crossentropy": 2.1356585025787354, "loss/hidden": 2.9375, "loss/jsd": 0.0, "loss/logits": 0.23074238747358322, "step": 6598 }, { "epoch": 0.4125, "grad_norm": 2.53125, "grad_norm_var": 0.02603759765625, "learning_rate": 0.0001, "loss": 7.4891, "loss/crossentropy": 2.144460916519165, "loss/hidden": 3.0859375, "loss/jsd": 0.0, "loss/logits": 0.22947844862937927, "step": 6600 }, { "epoch": 0.412625, "grad_norm": 2.53125, "grad_norm_var": 0.01558837890625, "learning_rate": 0.0001, "loss": 7.5585, "loss/crossentropy": 2.3757526874542236, "loss/hidden": 2.96875, "loss/jsd": 0.0, "loss/logits": 0.2253900170326233, "step": 6602 }, { "epoch": 0.41275, "grad_norm": 2.703125, "grad_norm_var": 0.0123687744140625, "learning_rate": 0.0001, "loss": 7.5179, "loss/crossentropy": 2.2484768629074097, "loss/hidden": 2.953125, "loss/jsd": 0.0, "loss/logits": 0.223810575902462, "step": 6604 }, { "epoch": 0.412875, "grad_norm": 2.421875, "grad_norm_var": 0.014696248372395833, "learning_rate": 0.0001, "loss": 7.3557, "loss/crossentropy": 2.1417400240898132, "loss/hidden": 2.890625, "loss/jsd": 0.0, "loss/logits": 0.20718573033809662, "step": 6606 }, { "epoch": 0.413, "grad_norm": 2.75, "grad_norm_var": 0.016820271809895832, "learning_rate": 0.0001, "loss": 7.5026, "loss/crossentropy": 2.024193048477173, "loss/hidden": 2.984375, "loss/jsd": 0.0, "loss/logits": 0.21772243082523346, "step": 6608 }, { "epoch": 0.413125, "grad_norm": 2.53125, "grad_norm_var": 0.011747233072916667, "learning_rate": 0.0001, "loss": 7.4232, "loss/crossentropy": 2.1193546056747437, "loss/hidden": 3.015625, "loss/jsd": 0.0, "loss/logits": 0.2398684024810791, "step": 6610 }, { "epoch": 0.41325, "grad_norm": 2.515625, "grad_norm_var": 0.008072916666666667, "learning_rate": 0.0001, "loss": 7.4945, "loss/crossentropy": 2.1500428915023804, "loss/hidden": 2.9375, "loss/jsd": 0.0, "loss/logits": 0.22196128964424133, "step": 6612 }, { "epoch": 0.413375, "grad_norm": 2.59375, "grad_norm_var": 0.021382649739583332, "learning_rate": 0.0001, "loss": 7.8258, "loss/crossentropy": 2.3873664140701294, "loss/hidden": 2.9375, "loss/jsd": 0.0, "loss/logits": 0.22806718200445175, "step": 6614 }, { "epoch": 0.4135, "grad_norm": 2.53125, "grad_norm_var": 0.0213043212890625, "learning_rate": 0.0001, "loss": 7.6071, "loss/crossentropy": 2.2266165018081665, "loss/hidden": 2.953125, "loss/jsd": 0.0, "loss/logits": 0.22689025849103928, "step": 6616 }, { "epoch": 0.413625, "grad_norm": 2.515625, "grad_norm_var": 0.020048014322916665, "learning_rate": 0.0001, "loss": 7.4037, "loss/crossentropy": 2.2330875396728516, "loss/hidden": 2.9765625, "loss/jsd": 0.0, "loss/logits": 0.23158963024616241, "step": 6618 }, { "epoch": 0.41375, "grad_norm": 2.703125, "grad_norm_var": 0.01959228515625, "learning_rate": 0.0001, "loss": 7.6731, "loss/crossentropy": 2.211314558982849, "loss/hidden": 2.96875, "loss/jsd": 0.0, "loss/logits": 0.2412443310022354, "step": 6620 }, { "epoch": 0.413875, "grad_norm": 2.734375, "grad_norm_var": 0.017154947916666666, "learning_rate": 0.0001, "loss": 7.7292, "loss/crossentropy": 2.2129745483398438, "loss/hidden": 3.140625, "loss/jsd": 0.0, "loss/logits": 0.26352236419916153, "step": 6622 }, { "epoch": 0.414, "grad_norm": 2.546875, "grad_norm_var": 0.0154937744140625, "learning_rate": 0.0001, "loss": 7.3652, "loss/crossentropy": 2.430734157562256, "loss/hidden": 3.0390625, "loss/jsd": 0.0, "loss/logits": 0.2314162403345108, "step": 6624 }, { "epoch": 0.414125, "grad_norm": 2.5, "grad_norm_var": 0.015599568684895834, "learning_rate": 0.0001, "loss": 7.613, "loss/crossentropy": 2.241546630859375, "loss/hidden": 2.9296875, "loss/jsd": 0.0, "loss/logits": 0.2257830947637558, "step": 6626 }, { "epoch": 0.41425, "grad_norm": 2.671875, "grad_norm_var": 0.014989217122395834, "learning_rate": 0.0001, "loss": 7.7027, "loss/crossentropy": 2.545127749443054, "loss/hidden": 3.03125, "loss/jsd": 0.0, "loss/logits": 0.23514911532402039, "step": 6628 }, { "epoch": 0.414375, "grad_norm": 2.640625, "grad_norm_var": 0.005257161458333334, "learning_rate": 0.0001, "loss": 7.7761, "loss/crossentropy": 2.1145858764648438, "loss/hidden": 2.9375, "loss/jsd": 0.0, "loss/logits": 0.2400895357131958, "step": 6630 }, { "epoch": 0.4145, "grad_norm": 2.5, "grad_norm_var": 0.005659993489583333, "learning_rate": 0.0001, "loss": 7.501, "loss/crossentropy": 2.353818416595459, "loss/hidden": 3.0078125, "loss/jsd": 0.0, "loss/logits": 0.24076513200998306, "step": 6632 }, { "epoch": 0.414625, "grad_norm": 2.484375, "grad_norm_var": 0.006180826822916667, "learning_rate": 0.0001, "loss": 7.5028, "loss/crossentropy": 2.2518410682678223, "loss/hidden": 3.0078125, "loss/jsd": 0.0, "loss/logits": 0.2216123417019844, "step": 6634 }, { "epoch": 0.41475, "grad_norm": 2.53125, "grad_norm_var": 0.005908203125, "learning_rate": 0.0001, "loss": 7.5763, "loss/crossentropy": 2.1215949654579163, "loss/hidden": 2.984375, "loss/jsd": 0.0, "loss/logits": 0.22310344129800797, "step": 6636 }, { "epoch": 0.414875, "grad_norm": 2.53125, "grad_norm_var": 0.0050852457682291664, "learning_rate": 0.0001, "loss": 7.5269, "loss/crossentropy": 2.167725086212158, "loss/hidden": 3.0234375, "loss/jsd": 0.0, "loss/logits": 0.21320636570453644, "step": 6638 }, { "epoch": 0.415, "grad_norm": 2.71875, "grad_norm_var": 0.00771484375, "learning_rate": 0.0001, "loss": 7.7989, "loss/crossentropy": 2.3983415365219116, "loss/hidden": 3.09375, "loss/jsd": 0.0, "loss/logits": 0.24660098552703857, "step": 6640 }, { "epoch": 0.415125, "grad_norm": 2.625, "grad_norm_var": 0.006982421875, "learning_rate": 0.0001, "loss": 7.4993, "loss/crossentropy": 2.0721535682678223, "loss/hidden": 2.90625, "loss/jsd": 0.0, "loss/logits": 0.2422334849834442, "step": 6642 }, { "epoch": 0.41525, "grad_norm": 2.6875, "grad_norm_var": 0.008185831705729167, "learning_rate": 0.0001, "loss": 7.7734, "loss/crossentropy": 1.992736041545868, "loss/hidden": 2.9609375, "loss/jsd": 0.0, "loss/logits": 0.22879430651664734, "step": 6644 }, { "epoch": 0.415375, "grad_norm": 2.65625, "grad_norm_var": 0.008056640625, "learning_rate": 0.0001, "loss": 7.4959, "loss/crossentropy": 2.3352534770965576, "loss/hidden": 3.0703125, "loss/jsd": 0.0, "loss/logits": 0.2520042806863785, "step": 6646 }, { "epoch": 0.4155, "grad_norm": 2.75, "grad_norm_var": 0.026676432291666666, "learning_rate": 0.0001, "loss": 7.7194, "loss/crossentropy": 2.1757973432540894, "loss/hidden": 2.9453125, "loss/jsd": 0.0, "loss/logits": 0.2261456400156021, "step": 6648 }, { "epoch": 0.415625, "grad_norm": 2.6875, "grad_norm_var": 0.025712076822916666, "learning_rate": 0.0001, "loss": 7.5891, "loss/crossentropy": 2.2398927807807922, "loss/hidden": 2.9296875, "loss/jsd": 0.0, "loss/logits": 0.22077034413814545, "step": 6650 }, { "epoch": 0.41575, "grad_norm": 2.5625, "grad_norm_var": 0.024787394205729167, "learning_rate": 0.0001, "loss": 7.5171, "loss/crossentropy": 2.1476768255233765, "loss/hidden": 3.0078125, "loss/jsd": 0.0, "loss/logits": 0.2027449607849121, "step": 6652 }, { "epoch": 0.415875, "grad_norm": 2.640625, "grad_norm_var": 0.030467732747395834, "learning_rate": 0.0001, "loss": 7.6189, "loss/crossentropy": 2.198335886001587, "loss/hidden": 3.015625, "loss/jsd": 0.0, "loss/logits": 0.21351244300603867, "step": 6654 }, { "epoch": 0.416, "grad_norm": 2.609375, "grad_norm_var": 0.03127848307291667, "learning_rate": 0.0001, "loss": 7.6406, "loss/crossentropy": 2.40294873714447, "loss/hidden": 2.9765625, "loss/jsd": 0.0, "loss/logits": 0.21814744174480438, "step": 6656 }, { "epoch": 0.416125, "grad_norm": 2.53125, "grad_norm_var": 0.03255106608072917, "learning_rate": 0.0001, "loss": 7.5104, "loss/crossentropy": 2.4102083444595337, "loss/hidden": 3.015625, "loss/jsd": 0.0, "loss/logits": 0.22978480905294418, "step": 6658 }, { "epoch": 0.41625, "grad_norm": 2.53125, "grad_norm_var": 0.031538899739583334, "learning_rate": 0.0001, "loss": 7.4799, "loss/crossentropy": 2.4791085720062256, "loss/hidden": 3.078125, "loss/jsd": 0.0, "loss/logits": 0.23873702436685562, "step": 6660 }, { "epoch": 0.416375, "grad_norm": 2.53125, "grad_norm_var": 0.03215230305989583, "learning_rate": 0.0001, "loss": 7.7792, "loss/crossentropy": 2.423929810523987, "loss/hidden": 3.0390625, "loss/jsd": 0.0, "loss/logits": 0.241975300014019, "step": 6662 }, { "epoch": 0.4165, "grad_norm": 2.515625, "grad_norm_var": 0.008199055989583334, "learning_rate": 0.0001, "loss": 7.5831, "loss/crossentropy": 2.3132593631744385, "loss/hidden": 3.046875, "loss/jsd": 0.0, "loss/logits": 0.21304812282323837, "step": 6664 }, { "epoch": 0.416625, "grad_norm": 2.578125, "grad_norm_var": 0.007323201497395833, "learning_rate": 0.0001, "loss": 7.431, "loss/crossentropy": 2.2098920345306396, "loss/hidden": 2.90625, "loss/jsd": 0.0, "loss/logits": 0.23346205055713654, "step": 6666 }, { "epoch": 0.41675, "grad_norm": 2.578125, "grad_norm_var": 0.008349609375, "learning_rate": 0.0001, "loss": 7.4496, "loss/crossentropy": 2.322250247001648, "loss/hidden": 3.0390625, "loss/jsd": 0.0, "loss/logits": 0.22749358415603638, "step": 6668 }, { "epoch": 0.416875, "grad_norm": 2.6875, "grad_norm_var": 0.005475870768229167, "learning_rate": 0.0001, "loss": 7.5849, "loss/crossentropy": 2.419236898422241, "loss/hidden": 2.9453125, "loss/jsd": 0.0, "loss/logits": 0.2175421565771103, "step": 6670 }, { "epoch": 0.417, "grad_norm": 2.796875, "grad_norm_var": 0.008561197916666667, "learning_rate": 0.0001, "loss": 7.7348, "loss/crossentropy": 2.3078067302703857, "loss/hidden": 2.9375, "loss/jsd": 0.0, "loss/logits": 0.2318611890077591, "step": 6672 }, { "epoch": 0.417125, "grad_norm": 2.734375, "grad_norm_var": 0.00933837890625, "learning_rate": 0.0001, "loss": 7.7137, "loss/crossentropy": 2.40431547164917, "loss/hidden": 3.0703125, "loss/jsd": 0.0, "loss/logits": 0.2437881901860237, "step": 6674 }, { "epoch": 0.41725, "grad_norm": 2.453125, "grad_norm_var": 0.010432942708333334, "learning_rate": 0.0001, "loss": 7.7133, "loss/crossentropy": 2.5853854417800903, "loss/hidden": 3.0546875, "loss/jsd": 0.0, "loss/logits": 0.2550952136516571, "step": 6676 }, { "epoch": 0.417375, "grad_norm": 2.640625, "grad_norm_var": 0.014850870768229166, "learning_rate": 0.0001, "loss": 7.5186, "loss/crossentropy": 2.496149778366089, "loss/hidden": 2.96875, "loss/jsd": 0.0, "loss/logits": 0.22854652255773544, "step": 6678 }, { "epoch": 0.4175, "grad_norm": 2.625, "grad_norm_var": 0.01826171875, "learning_rate": 0.0001, "loss": 7.635, "loss/crossentropy": 2.1969287395477295, "loss/hidden": 2.953125, "loss/jsd": 0.0, "loss/logits": 0.23653262853622437, "step": 6680 }, { "epoch": 0.417625, "grad_norm": 2.5625, "grad_norm_var": 0.020978800455729165, "learning_rate": 0.0001, "loss": 7.5695, "loss/crossentropy": 2.1755484342575073, "loss/hidden": 2.90625, "loss/jsd": 0.0, "loss/logits": 0.2031865119934082, "step": 6682 }, { "epoch": 0.41775, "grad_norm": 2.5625, "grad_norm_var": 0.019010416666666665, "learning_rate": 0.0001, "loss": 7.6347, "loss/crossentropy": 2.3422329425811768, "loss/hidden": 2.9375, "loss/jsd": 0.0, "loss/logits": 0.23694248497486115, "step": 6684 }, { "epoch": 0.417875, "grad_norm": 2.453125, "grad_norm_var": 0.022489420572916665, "learning_rate": 0.0001, "loss": 7.3131, "loss/crossentropy": 1.996912956237793, "loss/hidden": 2.9140625, "loss/jsd": 0.0, "loss/logits": 0.22247838228940964, "step": 6686 }, { "epoch": 0.418, "grad_norm": 2.5625, "grad_norm_var": 0.018815104166666666, "learning_rate": 0.0001, "loss": 7.5904, "loss/crossentropy": 2.2785043716430664, "loss/hidden": 2.984375, "loss/jsd": 0.0, "loss/logits": 0.24107830971479416, "step": 6688 }, { "epoch": 0.418125, "grad_norm": 2.5625, "grad_norm_var": 0.017992146809895835, "learning_rate": 0.0001, "loss": 7.4521, "loss/crossentropy": 2.234324097633362, "loss/hidden": 3.0546875, "loss/jsd": 0.0, "loss/logits": 0.22417379170656204, "step": 6690 }, { "epoch": 0.41825, "grad_norm": 2.703125, "grad_norm_var": 0.01968994140625, "learning_rate": 0.0001, "loss": 7.4978, "loss/crossentropy": 2.310504913330078, "loss/hidden": 3.0859375, "loss/jsd": 0.0, "loss/logits": 0.2348729521036148, "step": 6692 }, { "epoch": 0.418375, "grad_norm": 2.546875, "grad_norm_var": 0.014872233072916666, "learning_rate": 0.0001, "loss": 7.577, "loss/crossentropy": 2.237311005592346, "loss/hidden": 2.921875, "loss/jsd": 0.0, "loss/logits": 0.24628063291311264, "step": 6694 }, { "epoch": 0.4185, "grad_norm": 2.65625, "grad_norm_var": 0.0114410400390625, "learning_rate": 0.0001, "loss": 7.4194, "loss/crossentropy": 2.2272292375564575, "loss/hidden": 2.9453125, "loss/jsd": 0.0, "loss/logits": 0.246670164167881, "step": 6696 }, { "epoch": 0.418625, "grad_norm": 2.59375, "grad_norm_var": 0.0084136962890625, "learning_rate": 0.0001, "loss": 7.6333, "loss/crossentropy": 2.3391493558883667, "loss/hidden": 2.9609375, "loss/jsd": 0.0, "loss/logits": 0.23854172974824905, "step": 6698 }, { "epoch": 0.41875, "grad_norm": 2.390625, "grad_norm_var": 0.012458292643229167, "learning_rate": 0.0001, "loss": 7.6855, "loss/crossentropy": 2.3589494228363037, "loss/hidden": 2.9296875, "loss/jsd": 0.0, "loss/logits": 0.22756918519735336, "step": 6700 }, { "epoch": 0.418875, "grad_norm": 2.84375, "grad_norm_var": 0.011473592122395833, "learning_rate": 0.0001, "loss": 7.6844, "loss/crossentropy": 2.1670289039611816, "loss/hidden": 2.9609375, "loss/jsd": 0.0, "loss/logits": 0.2263951376080513, "step": 6702 }, { "epoch": 0.419, "grad_norm": 2.59375, "grad_norm_var": 0.012923177083333333, "learning_rate": 0.0001, "loss": 7.744, "loss/crossentropy": 2.3853793144226074, "loss/hidden": 3.0078125, "loss/jsd": 0.0, "loss/logits": 0.2287297621369362, "step": 6704 }, { "epoch": 0.419125, "grad_norm": 2.5, "grad_norm_var": 0.013597615559895833, "learning_rate": 0.0001, "loss": 7.4781, "loss/crossentropy": 2.097750425338745, "loss/hidden": 3.1171875, "loss/jsd": 0.0, "loss/logits": 0.21928713470697403, "step": 6706 }, { "epoch": 0.41925, "grad_norm": 2.5, "grad_norm_var": 0.0150054931640625, "learning_rate": 0.0001, "loss": 7.5361, "loss/crossentropy": 2.3825796842575073, "loss/hidden": 2.953125, "loss/jsd": 0.0, "loss/logits": 0.2336169183254242, "step": 6708 }, { "epoch": 0.419375, "grad_norm": 2.765625, "grad_norm_var": 0.0163238525390625, "learning_rate": 0.0001, "loss": 7.3965, "loss/crossentropy": 2.1737263202667236, "loss/hidden": 3.0078125, "loss/jsd": 0.0, "loss/logits": 0.20721475780010223, "step": 6710 }, { "epoch": 0.4195, "grad_norm": 2.59375, "grad_norm_var": 0.016185506184895834, "learning_rate": 0.0001, "loss": 7.491, "loss/crossentropy": 2.1605879068374634, "loss/hidden": 2.9765625, "loss/jsd": 0.0, "loss/logits": 0.22161926329135895, "step": 6712 }, { "epoch": 0.419625, "grad_norm": 2.40625, "grad_norm_var": 0.018700154622395833, "learning_rate": 0.0001, "loss": 7.5478, "loss/crossentropy": 2.135630488395691, "loss/hidden": 2.921875, "loss/jsd": 0.0, "loss/logits": 0.22485657781362534, "step": 6714 }, { "epoch": 0.41975, "grad_norm": 2.5625, "grad_norm_var": 0.0155914306640625, "learning_rate": 0.0001, "loss": 7.4895, "loss/crossentropy": 2.2189778089523315, "loss/hidden": 2.9609375, "loss/jsd": 0.0, "loss/logits": 0.22554031759500504, "step": 6716 }, { "epoch": 0.419875, "grad_norm": 2.59375, "grad_norm_var": 0.010856119791666667, "learning_rate": 0.0001, "loss": 7.5328, "loss/crossentropy": 2.248936414718628, "loss/hidden": 3.015625, "loss/jsd": 0.0, "loss/logits": 0.23680781573057175, "step": 6718 }, { "epoch": 0.42, "grad_norm": 2.515625, "grad_norm_var": 0.0077473958333333336, "learning_rate": 0.0001, "loss": 7.0591, "loss/crossentropy": 2.139213502407074, "loss/hidden": 2.90625, "loss/jsd": 0.0, "loss/logits": 0.2161421999335289, "step": 6720 }, { "epoch": 0.420125, "grad_norm": 2.421875, "grad_norm_var": 0.009537760416666667, "learning_rate": 0.0001, "loss": 7.6908, "loss/crossentropy": 2.26357901096344, "loss/hidden": 2.9140625, "loss/jsd": 0.0, "loss/logits": 0.22218022495508194, "step": 6722 }, { "epoch": 0.42025, "grad_norm": 2.796875, "grad_norm_var": 0.014290364583333333, "learning_rate": 0.0001, "loss": 7.4018, "loss/crossentropy": 2.2201942205429077, "loss/hidden": 2.9140625, "loss/jsd": 0.0, "loss/logits": 0.22970952093601227, "step": 6724 }, { "epoch": 0.420375, "grad_norm": 2.546875, "grad_norm_var": 0.011812337239583333, "learning_rate": 0.0001, "loss": 7.5667, "loss/crossentropy": 2.3013638257980347, "loss/hidden": 3.03125, "loss/jsd": 0.0, "loss/logits": 0.22569818049669266, "step": 6726 }, { "epoch": 0.4205, "grad_norm": 2.4375, "grad_norm_var": 0.013232421875, "learning_rate": 0.0001, "loss": 7.5204, "loss/crossentropy": 2.168237566947937, "loss/hidden": 3.0390625, "loss/jsd": 0.0, "loss/logits": 0.23707735538482666, "step": 6728 }, { "epoch": 0.420625, "grad_norm": 2.640625, "grad_norm_var": 0.012938435872395833, "learning_rate": 0.0001, "loss": 7.6375, "loss/crossentropy": 2.5784939527511597, "loss/hidden": 3.046875, "loss/jsd": 0.0, "loss/logits": 0.23290014266967773, "step": 6730 }, { "epoch": 0.42075, "grad_norm": 2.484375, "grad_norm_var": 0.012528483072916667, "learning_rate": 0.0001, "loss": 7.4494, "loss/crossentropy": 1.9907370805740356, "loss/hidden": 2.84375, "loss/jsd": 0.0, "loss/logits": 0.19960252195596695, "step": 6732 }, { "epoch": 0.420875, "grad_norm": 2.703125, "grad_norm_var": 0.013841756184895833, "learning_rate": 0.0001, "loss": 7.4412, "loss/crossentropy": 2.3281620740890503, "loss/hidden": 2.984375, "loss/jsd": 0.0, "loss/logits": 0.22964958101511002, "step": 6734 }, { "epoch": 0.421, "grad_norm": 2.859375, "grad_norm_var": 0.01636962890625, "learning_rate": 0.0001, "loss": 7.4611, "loss/crossentropy": 2.1842572689056396, "loss/hidden": 2.9375, "loss/jsd": 0.0, "loss/logits": 0.22636514902114868, "step": 6736 }, { "epoch": 0.421125, "grad_norm": 2.703125, "grad_norm_var": 0.016630045572916665, "learning_rate": 0.0001, "loss": 7.5901, "loss/crossentropy": 2.3323662281036377, "loss/hidden": 3.0078125, "loss/jsd": 0.0, "loss/logits": 0.21889524161815643, "step": 6738 }, { "epoch": 0.42125, "grad_norm": 2.578125, "grad_norm_var": 0.011839803059895833, "learning_rate": 0.0001, "loss": 7.4351, "loss/crossentropy": 2.0911742448806763, "loss/hidden": 2.953125, "loss/jsd": 0.0, "loss/logits": 0.21254166215658188, "step": 6740 }, { "epoch": 0.421375, "grad_norm": 2.53125, "grad_norm_var": 0.01265869140625, "learning_rate": 0.0001, "loss": 7.6039, "loss/crossentropy": 2.2947897911071777, "loss/hidden": 2.9921875, "loss/jsd": 0.0, "loss/logits": 0.22886648774147034, "step": 6742 }, { "epoch": 0.4215, "grad_norm": 2.578125, "grad_norm_var": 0.0106353759765625, "learning_rate": 0.0001, "loss": 7.64, "loss/crossentropy": 2.292522430419922, "loss/hidden": 3.046875, "loss/jsd": 0.0, "loss/logits": 0.23117417842149734, "step": 6744 }, { "epoch": 0.421625, "grad_norm": 2.90625, "grad_norm_var": 0.015705362955729166, "learning_rate": 0.0001, "loss": 7.5929, "loss/crossentropy": 2.1165157556533813, "loss/hidden": 2.984375, "loss/jsd": 0.0, "loss/logits": 0.20281828194856644, "step": 6746 }, { "epoch": 0.42175, "grad_norm": 3.0625, "grad_norm_var": 0.027099609375, "learning_rate": 0.0001, "loss": 7.5633, "loss/crossentropy": 2.3402230739593506, "loss/hidden": 2.921875, "loss/jsd": 0.0, "loss/logits": 0.22375594824552536, "step": 6748 }, { "epoch": 0.421875, "grad_norm": 2.515625, "grad_norm_var": 0.028531901041666665, "learning_rate": 0.0001, "loss": 7.5224, "loss/crossentropy": 2.45228374004364, "loss/hidden": 2.984375, "loss/jsd": 0.0, "loss/logits": 0.24891019612550735, "step": 6750 }, { "epoch": 0.422, "grad_norm": 2.984375, "grad_norm_var": 0.034886678059895836, "learning_rate": 0.0001, "loss": 7.4064, "loss/crossentropy": 2.2388226985931396, "loss/hidden": 2.9609375, "loss/jsd": 0.0, "loss/logits": 0.2258487194776535, "step": 6752 }, { "epoch": 0.422125, "grad_norm": 2.453125, "grad_norm_var": 0.03532613118489583, "learning_rate": 0.0001, "loss": 7.5003, "loss/crossentropy": 2.543209671974182, "loss/hidden": 2.9921875, "loss/jsd": 0.0, "loss/logits": 0.229293555021286, "step": 6754 }, { "epoch": 0.42225, "grad_norm": 2.625, "grad_norm_var": 0.034821573893229166, "learning_rate": 0.0001, "loss": 7.6449, "loss/crossentropy": 2.175012469291687, "loss/hidden": 2.8984375, "loss/jsd": 0.0, "loss/logits": 0.22282515466213226, "step": 6756 }, { "epoch": 0.422375, "grad_norm": 2.84375, "grad_norm_var": 0.037939453125, "learning_rate": 0.0001, "loss": 7.4797, "loss/crossentropy": 2.0612571239471436, "loss/hidden": 2.890625, "loss/jsd": 0.0, "loss/logits": 0.20665577054023743, "step": 6758 }, { "epoch": 0.4225, "grad_norm": 2.546875, "grad_norm_var": 0.043512980143229164, "learning_rate": 0.0001, "loss": 7.583, "loss/crossentropy": 2.405134081840515, "loss/hidden": 2.9765625, "loss/jsd": 0.0, "loss/logits": 0.2416313737630844, "step": 6760 }, { "epoch": 0.422625, "grad_norm": 2.609375, "grad_norm_var": 0.03986002604166667, "learning_rate": 0.0001, "loss": 7.5283, "loss/crossentropy": 2.3742516040802, "loss/hidden": 2.9765625, "loss/jsd": 0.0, "loss/logits": 0.2333827167749405, "step": 6762 }, { "epoch": 0.42275, "grad_norm": 2.8125, "grad_norm_var": 0.02783203125, "learning_rate": 0.0001, "loss": 7.6398, "loss/crossentropy": 2.137854218482971, "loss/hidden": 3.0, "loss/jsd": 0.0, "loss/logits": 0.2170201912522316, "step": 6764 }, { "epoch": 0.422875, "grad_norm": 2.78125, "grad_norm_var": 0.0645416259765625, "learning_rate": 0.0001, "loss": 7.4708, "loss/crossentropy": 2.369171142578125, "loss/hidden": 3.0859375, "loss/jsd": 0.0, "loss/logits": 0.23100775480270386, "step": 6766 }, { "epoch": 0.423, "grad_norm": 3.078125, "grad_norm_var": 0.0660064697265625, "learning_rate": 0.0001, "loss": 7.5856, "loss/crossentropy": 2.2748684883117676, "loss/hidden": 3.0546875, "loss/jsd": 0.0, "loss/logits": 0.2369702011346817, "step": 6768 }, { "epoch": 0.423125, "grad_norm": 2.484375, "grad_norm_var": 0.06510416666666667, "learning_rate": 0.0001, "loss": 7.5607, "loss/crossentropy": 2.2452908754348755, "loss/hidden": 2.9609375, "loss/jsd": 0.0, "loss/logits": 0.2098950892686844, "step": 6770 }, { "epoch": 0.42325, "grad_norm": 2.625, "grad_norm_var": 0.06784566243489583, "learning_rate": 0.0001, "loss": 7.2875, "loss/crossentropy": 2.23084557056427, "loss/hidden": 3.0390625, "loss/jsd": 0.0, "loss/logits": 0.23322972655296326, "step": 6772 }, { "epoch": 0.423375, "grad_norm": 2.6875, "grad_norm_var": 0.06519266764322916, "learning_rate": 0.0001, "loss": 7.407, "loss/crossentropy": 2.0073655247688293, "loss/hidden": 2.9765625, "loss/jsd": 0.0, "loss/logits": 0.217587448656559, "step": 6774 }, { "epoch": 0.4235, "grad_norm": 2.484375, "grad_norm_var": 0.05953369140625, "learning_rate": 0.0001, "loss": 7.494, "loss/crossentropy": 2.482353448867798, "loss/hidden": 3.015625, "loss/jsd": 0.0, "loss/logits": 0.24201367795467377, "step": 6776 }, { "epoch": 0.423625, "grad_norm": 2.375, "grad_norm_var": 0.0637115478515625, "learning_rate": 0.0001, "loss": 7.496, "loss/crossentropy": 2.4793208837509155, "loss/hidden": 3.03125, "loss/jsd": 0.0, "loss/logits": 0.24220801144838333, "step": 6778 }, { "epoch": 0.42375, "grad_norm": 2.484375, "grad_norm_var": 0.06653238932291666, "learning_rate": 0.0001, "loss": 7.4885, "loss/crossentropy": 2.157904624938965, "loss/hidden": 2.9609375, "loss/jsd": 0.0, "loss/logits": 0.2247818186879158, "step": 6780 }, { "epoch": 0.423875, "grad_norm": 2.53125, "grad_norm_var": 0.026106770833333334, "learning_rate": 0.0001, "loss": 7.2652, "loss/crossentropy": 2.0585808753967285, "loss/hidden": 3.0078125, "loss/jsd": 0.0, "loss/logits": 0.2114100679755211, "step": 6782 }, { "epoch": 0.424, "grad_norm": 2.796875, "grad_norm_var": 0.012726847330729167, "learning_rate": 0.0001, "loss": 7.7329, "loss/crossentropy": 2.2323436737060547, "loss/hidden": 2.96875, "loss/jsd": 0.0, "loss/logits": 0.21502574533224106, "step": 6784 }, { "epoch": 0.424125, "grad_norm": 2.75, "grad_norm_var": 0.014742024739583333, "learning_rate": 0.0001, "loss": 7.5767, "loss/crossentropy": 2.3387337923049927, "loss/hidden": 2.9765625, "loss/jsd": 0.0, "loss/logits": 0.22629782557487488, "step": 6786 }, { "epoch": 0.42425, "grad_norm": 2.4375, "grad_norm_var": 0.015754191080729167, "learning_rate": 0.0001, "loss": 7.6369, "loss/crossentropy": 2.4327272176742554, "loss/hidden": 3.0859375, "loss/jsd": 0.0, "loss/logits": 0.24523190408945084, "step": 6788 }, { "epoch": 0.424375, "grad_norm": 2.53125, "grad_norm_var": 0.015718587239583335, "learning_rate": 0.0001, "loss": 7.6679, "loss/crossentropy": 2.442686080932617, "loss/hidden": 2.9921875, "loss/jsd": 0.0, "loss/logits": 0.2262338101863861, "step": 6790 }, { "epoch": 0.4245, "grad_norm": 2.453125, "grad_norm_var": 0.015576171875, "learning_rate": 0.0001, "loss": 7.6008, "loss/crossentropy": 2.069571375846863, "loss/hidden": 3.0078125, "loss/jsd": 0.0, "loss/logits": 0.21501672267913818, "step": 6792 }, { "epoch": 0.424625, "grad_norm": 2.609375, "grad_norm_var": 0.012938435872395833, "learning_rate": 0.0001, "loss": 7.6962, "loss/crossentropy": 2.460999369621277, "loss/hidden": 3.015625, "loss/jsd": 0.0, "loss/logits": 0.24496762454509735, "step": 6794 }, { "epoch": 0.42475, "grad_norm": 2.609375, "grad_norm_var": 0.0182037353515625, "learning_rate": 0.0001, "loss": 7.8102, "loss/crossentropy": 2.211298942565918, "loss/hidden": 2.984375, "loss/jsd": 0.0, "loss/logits": 0.22223854809999466, "step": 6796 }, { "epoch": 0.424875, "grad_norm": 2.96875, "grad_norm_var": 0.02388916015625, "learning_rate": 0.0001, "loss": 7.5723, "loss/crossentropy": 2.3226964473724365, "loss/hidden": 3.0703125, "loss/jsd": 0.0, "loss/logits": 0.23604433238506317, "step": 6798 }, { "epoch": 0.425, "grad_norm": 2.59375, "grad_norm_var": 0.025544230143229166, "learning_rate": 0.0001, "loss": 7.6116, "loss/crossentropy": 2.3107694387435913, "loss/hidden": 3.0, "loss/jsd": 0.0, "loss/logits": 0.2311045154929161, "step": 6800 }, { "epoch": 0.425125, "grad_norm": 3.203125, "grad_norm_var": 0.04322509765625, "learning_rate": 0.0001, "loss": 7.6629, "loss/crossentropy": 2.3348811864852905, "loss/hidden": 2.9921875, "loss/jsd": 0.0, "loss/logits": 0.2444801777601242, "step": 6802 }, { "epoch": 0.42525, "grad_norm": 2.765625, "grad_norm_var": 0.0395172119140625, "learning_rate": 0.0001, "loss": 7.615, "loss/crossentropy": 2.3033541440963745, "loss/hidden": 3.015625, "loss/jsd": 0.0, "loss/logits": 0.24458616226911545, "step": 6804 }, { "epoch": 0.425375, "grad_norm": 2.6875, "grad_norm_var": 0.037984212239583336, "learning_rate": 0.0001, "loss": 7.5452, "loss/crossentropy": 2.0454084277153015, "loss/hidden": 2.96875, "loss/jsd": 0.0, "loss/logits": 0.22996855527162552, "step": 6806 }, { "epoch": 0.4255, "grad_norm": 2.703125, "grad_norm_var": 0.030370076497395832, "learning_rate": 0.0001, "loss": 7.7127, "loss/crossentropy": 2.2061938047409058, "loss/hidden": 3.109375, "loss/jsd": 0.0, "loss/logits": 0.2557276338338852, "step": 6808 }, { "epoch": 0.425625, "grad_norm": 2.453125, "grad_norm_var": 0.03406473795572917, "learning_rate": 0.0001, "loss": 7.645, "loss/crossentropy": 2.1404430866241455, "loss/hidden": 2.90625, "loss/jsd": 0.0, "loss/logits": 0.23241303116083145, "step": 6810 }, { "epoch": 0.42575, "grad_norm": 2.640625, "grad_norm_var": 0.032380167643229166, "learning_rate": 0.0001, "loss": 7.5111, "loss/crossentropy": 2.08138507604599, "loss/hidden": 2.921875, "loss/jsd": 0.0, "loss/logits": 0.20700395107269287, "step": 6812 }, { "epoch": 0.425875, "grad_norm": 2.796875, "grad_norm_var": 0.028987630208333334, "learning_rate": 0.0001, "loss": 7.5094, "loss/crossentropy": 2.1254957914352417, "loss/hidden": 2.90625, "loss/jsd": 0.0, "loss/logits": 0.2146979495882988, "step": 6814 }, { "epoch": 0.426, "grad_norm": 2.84375, "grad_norm_var": 0.0282623291015625, "learning_rate": 0.0001, "loss": 7.6562, "loss/crossentropy": 2.3065346479415894, "loss/hidden": 2.96875, "loss/jsd": 0.0, "loss/logits": 0.2218923643231392, "step": 6816 }, { "epoch": 0.426125, "grad_norm": 2.53125, "grad_norm_var": 0.015283203125, "learning_rate": 0.0001, "loss": 7.5499, "loss/crossentropy": 2.2197200059890747, "loss/hidden": 3.0234375, "loss/jsd": 0.0, "loss/logits": 0.22439652681350708, "step": 6818 }, { "epoch": 0.42625, "grad_norm": 2.421875, "grad_norm_var": 0.017041015625, "learning_rate": 0.0001, "loss": 7.4674, "loss/crossentropy": 2.3785969018936157, "loss/hidden": 3.015625, "loss/jsd": 0.0, "loss/logits": 0.22880811989307404, "step": 6820 }, { "epoch": 0.426375, "grad_norm": 2.5625, "grad_norm_var": 0.017267862955729168, "learning_rate": 0.0001, "loss": 7.5037, "loss/crossentropy": 2.2370119094848633, "loss/hidden": 2.9609375, "loss/jsd": 0.0, "loss/logits": 0.22262324392795563, "step": 6822 }, { "epoch": 0.4265, "grad_norm": 2.546875, "grad_norm_var": 0.01666259765625, "learning_rate": 0.0001, "loss": 7.6871, "loss/crossentropy": 2.3089643716812134, "loss/hidden": 2.953125, "loss/jsd": 0.0, "loss/logits": 0.23920578509569168, "step": 6824 }, { "epoch": 0.426625, "grad_norm": 2.5, "grad_norm_var": 0.018473307291666668, "learning_rate": 0.0001, "loss": 7.4618, "loss/crossentropy": 2.2593663930892944, "loss/hidden": 3.0078125, "loss/jsd": 0.0, "loss/logits": 0.2296246439218521, "step": 6826 }, { "epoch": 0.42675, "grad_norm": 2.625, "grad_norm_var": 0.019840494791666666, "learning_rate": 0.0001, "loss": 7.5933, "loss/crossentropy": 2.2942166328430176, "loss/hidden": 3.0, "loss/jsd": 0.0, "loss/logits": 0.22174550592899323, "step": 6828 }, { "epoch": 0.426875, "grad_norm": 2.6875, "grad_norm_var": 0.0181640625, "learning_rate": 0.0001, "loss": 7.4497, "loss/crossentropy": 1.97617906332016, "loss/hidden": 2.828125, "loss/jsd": 0.0, "loss/logits": 0.2070036381483078, "step": 6830 }, { "epoch": 0.427, "grad_norm": 2.578125, "grad_norm_var": 0.015119425455729167, "learning_rate": 0.0001, "loss": 7.4723, "loss/crossentropy": 2.2410775423049927, "loss/hidden": 2.921875, "loss/jsd": 0.0, "loss/logits": 0.22592800855636597, "step": 6832 }, { "epoch": 0.427125, "grad_norm": 2.4375, "grad_norm_var": 0.014774576822916666, "learning_rate": 0.0001, "loss": 7.5358, "loss/crossentropy": 2.2140880823135376, "loss/hidden": 2.9375, "loss/jsd": 0.0, "loss/logits": 0.2219024896621704, "step": 6834 }, { "epoch": 0.42725, "grad_norm": 2.625, "grad_norm_var": 0.013313802083333333, "learning_rate": 0.0001, "loss": 7.67, "loss/crossentropy": 2.3348313570022583, "loss/hidden": 3.0, "loss/jsd": 0.0, "loss/logits": 0.22434741258621216, "step": 6836 }, { "epoch": 0.427375, "grad_norm": 2.671875, "grad_norm_var": 0.013605753580729166, "learning_rate": 0.0001, "loss": 7.7384, "loss/crossentropy": 2.187626600265503, "loss/hidden": 2.9375, "loss/jsd": 0.0, "loss/logits": 0.22358687222003937, "step": 6838 }, { "epoch": 0.4275, "grad_norm": 2.59375, "grad_norm_var": 0.015208943684895834, "learning_rate": 0.0001, "loss": 7.3899, "loss/crossentropy": 2.1965412497520447, "loss/hidden": 2.9375, "loss/jsd": 0.0, "loss/logits": 0.2108786478638649, "step": 6840 }, { "epoch": 0.427625, "grad_norm": 2.53125, "grad_norm_var": 0.010619099934895833, "learning_rate": 0.0001, "loss": 7.5092, "loss/crossentropy": 2.1788800954818726, "loss/hidden": 2.875, "loss/jsd": 0.0, "loss/logits": 0.21197441220283508, "step": 6842 }, { "epoch": 0.42775, "grad_norm": 2.5625, "grad_norm_var": 0.01070556640625, "learning_rate": 0.0001, "loss": 7.5364, "loss/crossentropy": 2.344847083091736, "loss/hidden": 2.8984375, "loss/jsd": 0.0, "loss/logits": 0.2121613845229149, "step": 6844 }, { "epoch": 0.427875, "grad_norm": 2.59375, "grad_norm_var": 0.01510009765625, "learning_rate": 0.0001, "loss": 7.6922, "loss/crossentropy": 2.281248092651367, "loss/hidden": 3.046875, "loss/jsd": 0.0, "loss/logits": 0.23826022446155548, "step": 6846 }, { "epoch": 0.428, "grad_norm": 2.421875, "grad_norm_var": 0.0187164306640625, "learning_rate": 0.0001, "loss": 7.4142, "loss/crossentropy": 2.13250470161438, "loss/hidden": 2.890625, "loss/jsd": 0.0, "loss/logits": 0.20225460082292557, "step": 6848 }, { "epoch": 0.428125, "grad_norm": 2.65625, "grad_norm_var": 0.014676920572916667, "learning_rate": 0.0001, "loss": 7.5326, "loss/crossentropy": 2.130977749824524, "loss/hidden": 2.9296875, "loss/jsd": 0.0, "loss/logits": 0.22926100343465805, "step": 6850 }, { "epoch": 0.42825, "grad_norm": 2.546875, "grad_norm_var": 0.014774576822916666, "learning_rate": 0.0001, "loss": 7.3658, "loss/crossentropy": 2.2894575595855713, "loss/hidden": 2.90625, "loss/jsd": 0.0, "loss/logits": 0.2376842200756073, "step": 6852 }, { "epoch": 0.428375, "grad_norm": 2.609375, "grad_norm_var": 0.013374837239583333, "learning_rate": 0.0001, "loss": 7.4424, "loss/crossentropy": 2.15961492061615, "loss/hidden": 2.9375, "loss/jsd": 0.0, "loss/logits": 0.22177959978580475, "step": 6854 }, { "epoch": 0.4285, "grad_norm": 2.609375, "grad_norm_var": 0.0124908447265625, "learning_rate": 0.0001, "loss": 7.3789, "loss/crossentropy": 2.11038601398468, "loss/hidden": 2.9140625, "loss/jsd": 0.0, "loss/logits": 0.22574742883443832, "step": 6856 }, { "epoch": 0.428625, "grad_norm": 2.53125, "grad_norm_var": 0.012984212239583333, "learning_rate": 0.0001, "loss": 7.4624, "loss/crossentropy": 2.3959596157073975, "loss/hidden": 2.9140625, "loss/jsd": 0.0, "loss/logits": 0.2288554161787033, "step": 6858 }, { "epoch": 0.42875, "grad_norm": 2.640625, "grad_norm_var": 0.013426717122395833, "learning_rate": 0.0001, "loss": 7.5736, "loss/crossentropy": 2.3469239473342896, "loss/hidden": 2.9296875, "loss/jsd": 0.0, "loss/logits": 0.2149468958377838, "step": 6860 }, { "epoch": 0.428875, "grad_norm": 2.640625, "grad_norm_var": 0.0088531494140625, "learning_rate": 0.0001, "loss": 7.6086, "loss/crossentropy": 2.385411858558655, "loss/hidden": 3.0234375, "loss/jsd": 0.0, "loss/logits": 0.2368868589401245, "step": 6862 }, { "epoch": 0.429, "grad_norm": 2.640625, "grad_norm_var": 0.0057942708333333336, "learning_rate": 0.0001, "loss": 7.5762, "loss/crossentropy": 2.2060033082962036, "loss/hidden": 2.953125, "loss/jsd": 0.0, "loss/logits": 0.22091632336378098, "step": 6864 }, { "epoch": 0.429125, "grad_norm": 2.71875, "grad_norm_var": 0.005777994791666667, "learning_rate": 0.0001, "loss": 7.6356, "loss/crossentropy": 2.3089367151260376, "loss/hidden": 3.0234375, "loss/jsd": 0.0, "loss/logits": 0.2498275339603424, "step": 6866 }, { "epoch": 0.42925, "grad_norm": 2.390625, "grad_norm_var": 0.007405598958333333, "learning_rate": 0.0001, "loss": 7.4189, "loss/crossentropy": 2.243834137916565, "loss/hidden": 2.921875, "loss/jsd": 0.0, "loss/logits": 0.2133392095565796, "step": 6868 }, { "epoch": 0.429375, "grad_norm": 2.84375, "grad_norm_var": 0.012788899739583333, "learning_rate": 0.0001, "loss": 7.4553, "loss/crossentropy": 2.4016648530960083, "loss/hidden": 3.0078125, "loss/jsd": 0.0, "loss/logits": 0.22162681818008423, "step": 6870 }, { "epoch": 0.4295, "grad_norm": 2.59375, "grad_norm_var": 0.0183990478515625, "learning_rate": 0.0001, "loss": 7.4954, "loss/crossentropy": 2.3029072284698486, "loss/hidden": 2.8984375, "loss/jsd": 0.0, "loss/logits": 0.21795833855867386, "step": 6872 }, { "epoch": 0.429625, "grad_norm": 2.515625, "grad_norm_var": 0.018994140625, "learning_rate": 0.0001, "loss": 7.411, "loss/crossentropy": 2.222460627555847, "loss/hidden": 2.953125, "loss/jsd": 0.0, "loss/logits": 0.2248850017786026, "step": 6874 }, { "epoch": 0.42975, "grad_norm": 2.5625, "grad_norm_var": 0.0178619384765625, "learning_rate": 0.0001, "loss": 7.4801, "loss/crossentropy": 2.1485586762428284, "loss/hidden": 2.9453125, "loss/jsd": 0.0, "loss/logits": 0.20181532949209213, "step": 6876 }, { "epoch": 0.429875, "grad_norm": 2.625, "grad_norm_var": 0.018587239583333335, "learning_rate": 0.0001, "loss": 7.5921, "loss/crossentropy": 2.232265830039978, "loss/hidden": 3.0390625, "loss/jsd": 0.0, "loss/logits": 0.22481940686702728, "step": 6878 }, { "epoch": 0.43, "grad_norm": 2.6875, "grad_norm_var": 0.01939697265625, "learning_rate": 0.0001, "loss": 7.4812, "loss/crossentropy": 2.2070114612579346, "loss/hidden": 2.9140625, "loss/jsd": 0.0, "loss/logits": 0.2424803152680397, "step": 6880 }, { "epoch": 0.430125, "grad_norm": 2.484375, "grad_norm_var": 0.020124308268229165, "learning_rate": 0.0001, "loss": 7.5596, "loss/crossentropy": 2.3880205154418945, "loss/hidden": 2.9921875, "loss/jsd": 0.0, "loss/logits": 0.21555181592702866, "step": 6882 }, { "epoch": 0.43025, "grad_norm": 2.515625, "grad_norm_var": 0.017997233072916667, "learning_rate": 0.0001, "loss": 7.6153, "loss/crossentropy": 2.3800392150878906, "loss/hidden": 3.0234375, "loss/jsd": 0.0, "loss/logits": 0.22199943661689758, "step": 6884 }, { "epoch": 0.430375, "grad_norm": 3.140625, "grad_norm_var": 0.030598958333333332, "learning_rate": 0.0001, "loss": 7.6735, "loss/crossentropy": 2.2968798875808716, "loss/hidden": 3.03125, "loss/jsd": 0.0, "loss/logits": 0.22405706346035004, "step": 6886 }, { "epoch": 0.4305, "grad_norm": 2.671875, "grad_norm_var": 0.025516764322916666, "learning_rate": 0.0001, "loss": 7.5856, "loss/crossentropy": 2.2794270515441895, "loss/hidden": 3.09375, "loss/jsd": 0.0, "loss/logits": 0.27198177576065063, "step": 6888 }, { "epoch": 0.430625, "grad_norm": 2.609375, "grad_norm_var": 0.024706013997395835, "learning_rate": 0.0001, "loss": 7.684, "loss/crossentropy": 2.4670194387435913, "loss/hidden": 2.9453125, "loss/jsd": 0.0, "loss/logits": 0.2296854704618454, "step": 6890 }, { "epoch": 0.43075, "grad_norm": 2.421875, "grad_norm_var": 0.028791300455729165, "learning_rate": 0.0001, "loss": 7.3849, "loss/crossentropy": 2.32903790473938, "loss/hidden": 2.9140625, "loss/jsd": 0.0, "loss/logits": 0.20303255319595337, "step": 6892 }, { "epoch": 0.430875, "grad_norm": 2.453125, "grad_norm_var": 0.034056599934895834, "learning_rate": 0.0001, "loss": 7.3481, "loss/crossentropy": 2.3374183177948, "loss/hidden": 3.0, "loss/jsd": 0.0, "loss/logits": 0.22310950607061386, "step": 6894 }, { "epoch": 0.431, "grad_norm": 2.4375, "grad_norm_var": 0.03381754557291667, "learning_rate": 0.0001, "loss": 7.5074, "loss/crossentropy": 2.249111294746399, "loss/hidden": 2.9375, "loss/jsd": 0.0, "loss/logits": 0.2400292605161667, "step": 6896 }, { "epoch": 0.431125, "grad_norm": 2.5, "grad_norm_var": 0.0336334228515625, "learning_rate": 0.0001, "loss": 7.4589, "loss/crossentropy": 2.156920313835144, "loss/hidden": 2.8828125, "loss/jsd": 0.0, "loss/logits": 0.21415946632623672, "step": 6898 }, { "epoch": 0.43125, "grad_norm": 2.671875, "grad_norm_var": 0.03292643229166667, "learning_rate": 0.0001, "loss": 7.3625, "loss/crossentropy": 2.1776874661445618, "loss/hidden": 2.921875, "loss/jsd": 0.0, "loss/logits": 0.2244596853852272, "step": 6900 }, { "epoch": 0.431375, "grad_norm": 2.5, "grad_norm_var": 0.008138020833333334, "learning_rate": 0.0001, "loss": 7.2768, "loss/crossentropy": 1.9912413358688354, "loss/hidden": 2.8671875, "loss/jsd": 0.0, "loss/logits": 0.2006819173693657, "step": 6902 }, { "epoch": 0.4315, "grad_norm": 2.609375, "grad_norm_var": 0.006403605143229167, "learning_rate": 0.0001, "loss": 7.5539, "loss/crossentropy": 2.323360800743103, "loss/hidden": 2.9140625, "loss/jsd": 0.0, "loss/logits": 0.23047194629907608, "step": 6904 }, { "epoch": 0.431625, "grad_norm": 2.5625, "grad_norm_var": 0.009137980143229167, "learning_rate": 0.0001, "loss": 7.5008, "loss/crossentropy": 2.364463686943054, "loss/hidden": 3.0546875, "loss/jsd": 0.0, "loss/logits": 0.22141238301992416, "step": 6906 }, { "epoch": 0.43175, "grad_norm": 2.546875, "grad_norm_var": 0.008470662434895833, "learning_rate": 0.0001, "loss": 7.5543, "loss/crossentropy": 2.5402129888534546, "loss/hidden": 2.96875, "loss/jsd": 0.0, "loss/logits": 0.24208682775497437, "step": 6908 }, { "epoch": 0.431875, "grad_norm": 2.546875, "grad_norm_var": 0.0058553059895833336, "learning_rate": 0.0001, "loss": 7.2773, "loss/crossentropy": 2.426114797592163, "loss/hidden": 2.9453125, "loss/jsd": 0.0, "loss/logits": 0.23964547365903854, "step": 6910 }, { "epoch": 0.432, "grad_norm": 2.765625, "grad_norm_var": 0.008014933268229166, "learning_rate": 0.0001, "loss": 7.6238, "loss/crossentropy": 2.3094054460525513, "loss/hidden": 2.9296875, "loss/jsd": 0.0, "loss/logits": 0.2353430762887001, "step": 6912 }, { "epoch": 0.432125, "grad_norm": 2.671875, "grad_norm_var": 0.008349609375, "learning_rate": 0.0001, "loss": 7.5314, "loss/crossentropy": 2.2929235696792603, "loss/hidden": 2.984375, "loss/jsd": 0.0, "loss/logits": 0.2039901688694954, "step": 6914 }, { "epoch": 0.43225, "grad_norm": 2.453125, "grad_norm_var": 0.008756510416666667, "learning_rate": 0.0001, "loss": 7.3504, "loss/crossentropy": 2.345077157020569, "loss/hidden": 3.0703125, "loss/jsd": 0.0, "loss/logits": 0.22398847341537476, "step": 6916 }, { "epoch": 0.432375, "grad_norm": 2.71875, "grad_norm_var": 0.0094390869140625, "learning_rate": 0.0001, "loss": 7.55, "loss/crossentropy": 2.2170501947402954, "loss/hidden": 2.8984375, "loss/jsd": 0.0, "loss/logits": 0.21256496012210846, "step": 6918 }, { "epoch": 0.4325, "grad_norm": 2.78125, "grad_norm_var": 0.011442057291666667, "learning_rate": 0.0001, "loss": 7.4107, "loss/crossentropy": 2.291950821876526, "loss/hidden": 2.9453125, "loss/jsd": 0.0, "loss/logits": 0.22643620520830154, "step": 6920 }, { "epoch": 0.432625, "grad_norm": 2.53125, "grad_norm_var": 0.011649576822916667, "learning_rate": 0.0001, "loss": 7.6607, "loss/crossentropy": 2.150121331214905, "loss/hidden": 3.046875, "loss/jsd": 0.0, "loss/logits": 0.2236119732260704, "step": 6922 }, { "epoch": 0.43275, "grad_norm": 2.546875, "grad_norm_var": 0.011677042643229166, "learning_rate": 0.0001, "loss": 7.8228, "loss/crossentropy": 2.438883662223816, "loss/hidden": 3.0859375, "loss/jsd": 0.0, "loss/logits": 0.2510446310043335, "step": 6924 }, { "epoch": 0.432875, "grad_norm": 2.8125, "grad_norm_var": 0.04439188639322917, "learning_rate": 0.0001, "loss": 7.7306, "loss/crossentropy": 2.370510697364807, "loss/hidden": 3.0546875, "loss/jsd": 0.0, "loss/logits": 0.29264724254608154, "step": 6926 }, { "epoch": 0.433, "grad_norm": 2.484375, "grad_norm_var": 0.045075480143229166, "learning_rate": 0.0001, "loss": 7.4752, "loss/crossentropy": 2.18233585357666, "loss/hidden": 2.8828125, "loss/jsd": 0.0, "loss/logits": 0.23772528022527695, "step": 6928 }, { "epoch": 0.433125, "grad_norm": 2.671875, "grad_norm_var": 0.046052042643229166, "learning_rate": 0.0001, "loss": 7.668, "loss/crossentropy": 2.494891405105591, "loss/hidden": 3.078125, "loss/jsd": 0.0, "loss/logits": 0.2504590004682541, "step": 6930 }, { "epoch": 0.43325, "grad_norm": 2.703125, "grad_norm_var": 0.07672119140625, "learning_rate": 0.0001, "loss": 7.592, "loss/crossentropy": 2.122566342353821, "loss/hidden": 2.90625, "loss/jsd": 0.0, "loss/logits": 0.2392774149775505, "step": 6932 }, { "epoch": 0.433375, "grad_norm": 2.75, "grad_norm_var": 0.07491861979166667, "learning_rate": 0.0001, "loss": 7.634, "loss/crossentropy": 2.5051095485687256, "loss/hidden": 2.953125, "loss/jsd": 0.0, "loss/logits": 0.22540390491485596, "step": 6934 }, { "epoch": 0.4335, "grad_norm": 3.0, "grad_norm_var": 0.07903544108072917, "learning_rate": 0.0001, "loss": 7.6147, "loss/crossentropy": 2.4519747495651245, "loss/hidden": 3.125, "loss/jsd": 0.0, "loss/logits": 0.24915387481451035, "step": 6936 }, { "epoch": 0.433625, "grad_norm": 3.1875, "grad_norm_var": 0.08785807291666667, "learning_rate": 0.0001, "loss": 7.6643, "loss/crossentropy": 2.3023834228515625, "loss/hidden": 2.96875, "loss/jsd": 0.0, "loss/logits": 0.2158026248216629, "step": 6938 }, { "epoch": 0.43375, "grad_norm": 2.78125, "grad_norm_var": 0.08146158854166667, "learning_rate": 0.0001, "loss": 7.7465, "loss/crossentropy": 2.394049286842346, "loss/hidden": 3.1015625, "loss/jsd": 0.0, "loss/logits": 0.26030176132917404, "step": 6940 }, { "epoch": 0.433875, "grad_norm": 2.65625, "grad_norm_var": 0.07146708170572917, "learning_rate": 0.0001, "loss": 7.6446, "loss/crossentropy": 2.113188326358795, "loss/hidden": 2.9765625, "loss/jsd": 0.0, "loss/logits": 0.2042284458875656, "step": 6942 }, { "epoch": 0.434, "grad_norm": 2.5625, "grad_norm_var": 0.06424153645833333, "learning_rate": 0.0001, "loss": 7.4091, "loss/crossentropy": 2.407089948654175, "loss/hidden": 3.0078125, "loss/jsd": 0.0, "loss/logits": 0.2415885180234909, "step": 6944 }, { "epoch": 0.434125, "grad_norm": 2.671875, "grad_norm_var": 0.06424153645833333, "learning_rate": 0.0001, "loss": 7.7385, "loss/crossentropy": 2.5629037618637085, "loss/hidden": 3.015625, "loss/jsd": 0.0, "loss/logits": 0.24821512401103973, "step": 6946 }, { "epoch": 0.43425, "grad_norm": 2.4375, "grad_norm_var": 0.04962565104166667, "learning_rate": 0.0001, "loss": 7.4078, "loss/crossentropy": 2.4120923280715942, "loss/hidden": 2.9140625, "loss/jsd": 0.0, "loss/logits": 0.22190524637699127, "step": 6948 }, { "epoch": 0.434375, "grad_norm": 2.515625, "grad_norm_var": 0.05373433430989583, "learning_rate": 0.0001, "loss": 7.6087, "loss/crossentropy": 2.1990346908569336, "loss/hidden": 3.015625, "loss/jsd": 0.0, "loss/logits": 0.24148999154567719, "step": 6950 }, { "epoch": 0.4345, "grad_norm": 2.421875, "grad_norm_var": 0.052033487955729166, "learning_rate": 0.0001, "loss": 7.5092, "loss/crossentropy": 2.2445861101150513, "loss/hidden": 2.9375, "loss/jsd": 0.0, "loss/logits": 0.2081500142812729, "step": 6952 }, { "epoch": 0.434625, "grad_norm": 2.484375, "grad_norm_var": 0.029792277018229167, "learning_rate": 0.0001, "loss": 7.523, "loss/crossentropy": 2.526068925857544, "loss/hidden": 3.09375, "loss/jsd": 0.0, "loss/logits": 0.2585867792367935, "step": 6954 }, { "epoch": 0.43475, "grad_norm": 2.71875, "grad_norm_var": 0.0170806884765625, "learning_rate": 0.0001, "loss": 7.4422, "loss/crossentropy": 2.319674253463745, "loss/hidden": 3.0, "loss/jsd": 0.0, "loss/logits": 0.2369941845536232, "step": 6956 }, { "epoch": 0.434875, "grad_norm": 2.390625, "grad_norm_var": 0.020052083333333335, "learning_rate": 0.0001, "loss": 7.2858, "loss/crossentropy": 2.032672941684723, "loss/hidden": 2.9453125, "loss/jsd": 0.0, "loss/logits": 0.22597885131835938, "step": 6958 }, { "epoch": 0.435, "grad_norm": 2.65625, "grad_norm_var": 0.02027587890625, "learning_rate": 0.0001, "loss": 7.4503, "loss/crossentropy": 2.3739267587661743, "loss/hidden": 2.921875, "loss/jsd": 0.0, "loss/logits": 0.2178945243358612, "step": 6960 }, { "epoch": 0.435125, "grad_norm": 2.578125, "grad_norm_var": 0.0130859375, "learning_rate": 0.0001, "loss": 7.4552, "loss/crossentropy": 2.250732421875, "loss/hidden": 2.9921875, "loss/jsd": 0.0, "loss/logits": 0.26057247072458267, "step": 6962 }, { "epoch": 0.43525, "grad_norm": 2.59375, "grad_norm_var": 0.0200592041015625, "learning_rate": 0.0001, "loss": 7.566, "loss/crossentropy": 2.3947852849960327, "loss/hidden": 3.0078125, "loss/jsd": 0.0, "loss/logits": 0.22966811805963516, "step": 6964 }, { "epoch": 0.435375, "grad_norm": 2.609375, "grad_norm_var": 0.0195953369140625, "learning_rate": 0.0001, "loss": 7.408, "loss/crossentropy": 2.0657782554626465, "loss/hidden": 2.875, "loss/jsd": 0.0, "loss/logits": 0.21268612146377563, "step": 6966 }, { "epoch": 0.4355, "grad_norm": 2.59375, "grad_norm_var": 0.017194620768229165, "learning_rate": 0.0001, "loss": 7.7375, "loss/crossentropy": 2.441284656524658, "loss/hidden": 3.03125, "loss/jsd": 0.0, "loss/logits": 0.2511487305164337, "step": 6968 }, { "epoch": 0.435625, "grad_norm": 2.65625, "grad_norm_var": 0.03186442057291667, "learning_rate": 0.0001, "loss": 7.7483, "loss/crossentropy": 2.4622262716293335, "loss/hidden": 3.046875, "loss/jsd": 0.0, "loss/logits": 0.24444110691547394, "step": 6970 }, { "epoch": 0.43575, "grad_norm": 2.578125, "grad_norm_var": 0.031183878580729168, "learning_rate": 0.0001, "loss": 7.3592, "loss/crossentropy": 2.1688013076782227, "loss/hidden": 3.0390625, "loss/jsd": 0.0, "loss/logits": 0.23788545280694962, "step": 6972 }, { "epoch": 0.435875, "grad_norm": 2.609375, "grad_norm_var": 0.0272125244140625, "learning_rate": 0.0001, "loss": 7.3486, "loss/crossentropy": 2.2716058492660522, "loss/hidden": 2.9453125, "loss/jsd": 0.0, "loss/logits": 0.22088874131441116, "step": 6974 }, { "epoch": 0.436, "grad_norm": 2.46875, "grad_norm_var": 0.030729166666666665, "learning_rate": 0.0001, "loss": 7.5265, "loss/crossentropy": 2.331586241722107, "loss/hidden": 2.96875, "loss/jsd": 0.0, "loss/logits": 0.22634634375572205, "step": 6976 }, { "epoch": 0.436125, "grad_norm": 2.484375, "grad_norm_var": 0.03331705729166667, "learning_rate": 0.0001, "loss": 7.6346, "loss/crossentropy": 2.514127016067505, "loss/hidden": 3.0625, "loss/jsd": 0.0, "loss/logits": 0.2315809652209282, "step": 6978 }, { "epoch": 0.43625, "grad_norm": 2.390625, "grad_norm_var": 0.030492146809895832, "learning_rate": 0.0001, "loss": 7.2929, "loss/crossentropy": 2.2412805557250977, "loss/hidden": 2.9609375, "loss/jsd": 0.0, "loss/logits": 0.22676674276590347, "step": 6980 }, { "epoch": 0.436375, "grad_norm": 2.578125, "grad_norm_var": 0.030427042643229166, "learning_rate": 0.0001, "loss": 7.5346, "loss/crossentropy": 2.0969573259353638, "loss/hidden": 2.8984375, "loss/jsd": 0.0, "loss/logits": 0.2131844088435173, "step": 6982 }, { "epoch": 0.4365, "grad_norm": 2.921875, "grad_norm_var": 0.03785400390625, "learning_rate": 0.0001, "loss": 7.8178, "loss/crossentropy": 2.575149893760681, "loss/hidden": 3.0234375, "loss/jsd": 0.0, "loss/logits": 0.25528932362794876, "step": 6984 }, { "epoch": 0.436625, "grad_norm": 2.765625, "grad_norm_var": 0.021435546875, "learning_rate": 0.0001, "loss": 7.6106, "loss/crossentropy": 2.218082904815674, "loss/hidden": 2.96875, "loss/jsd": 0.0, "loss/logits": 0.229249969124794, "step": 6986 }, { "epoch": 0.43675, "grad_norm": 2.484375, "grad_norm_var": 0.022163899739583333, "learning_rate": 0.0001, "loss": 7.4339, "loss/crossentropy": 2.3712974786758423, "loss/hidden": 2.96875, "loss/jsd": 0.0, "loss/logits": 0.22180631756782532, "step": 6988 }, { "epoch": 0.436875, "grad_norm": 2.46875, "grad_norm_var": 0.022972615559895833, "learning_rate": 0.0001, "loss": 7.5122, "loss/crossentropy": 2.1957250833511353, "loss/hidden": 2.859375, "loss/jsd": 0.0, "loss/logits": 0.2212134599685669, "step": 6990 }, { "epoch": 0.437, "grad_norm": 2.515625, "grad_norm_var": 0.0211334228515625, "learning_rate": 0.0001, "loss": 7.5077, "loss/crossentropy": 2.418782114982605, "loss/hidden": 2.984375, "loss/jsd": 0.0, "loss/logits": 0.2276376411318779, "step": 6992 }, { "epoch": 0.437125, "grad_norm": 2.671875, "grad_norm_var": 0.018903605143229165, "learning_rate": 0.0001, "loss": 7.3885, "loss/crossentropy": 2.2041863203048706, "loss/hidden": 2.9921875, "loss/jsd": 0.0, "loss/logits": 0.20726150274276733, "step": 6994 }, { "epoch": 0.43725, "grad_norm": 2.5, "grad_norm_var": 0.015819295247395834, "learning_rate": 0.0001, "loss": 7.5815, "loss/crossentropy": 2.095567524433136, "loss/hidden": 2.96875, "loss/jsd": 0.0, "loss/logits": 0.19663628935813904, "step": 6996 }, { "epoch": 0.437375, "grad_norm": 2.59375, "grad_norm_var": 0.017350260416666666, "learning_rate": 0.0001, "loss": 7.3827, "loss/crossentropy": 2.1922199726104736, "loss/hidden": 2.96875, "loss/jsd": 0.0, "loss/logits": 0.22253280878067017, "step": 6998 }, { "epoch": 0.4375, "grad_norm": 2.640625, "grad_norm_var": 0.009032185872395833, "learning_rate": 0.0001, "loss": 7.4403, "loss/crossentropy": 2.2777546644210815, "loss/hidden": 3.015625, "loss/jsd": 0.0, "loss/logits": 0.21848614513874054, "step": 7000 }, { "epoch": 0.437625, "grad_norm": 2.4375, "grad_norm_var": 0.009212239583333334, "learning_rate": 0.0001, "loss": 7.3364, "loss/crossentropy": 2.25589919090271, "loss/hidden": 3.0234375, "loss/jsd": 0.0, "loss/logits": 0.23384109139442444, "step": 7002 }, { "epoch": 0.43775, "grad_norm": 2.9375, "grad_norm_var": 0.0322174072265625, "learning_rate": 0.0001, "loss": 7.6373, "loss/crossentropy": 2.0874950885772705, "loss/hidden": 2.875, "loss/jsd": 0.0, "loss/logits": 0.21009615808725357, "step": 7004 }, { "epoch": 0.437875, "grad_norm": 2.375, "grad_norm_var": 0.03596089680989583, "learning_rate": 0.0001, "loss": 7.5382, "loss/crossentropy": 2.4048361778259277, "loss/hidden": 2.953125, "loss/jsd": 0.0, "loss/logits": 0.2398330420255661, "step": 7006 }, { "epoch": 0.438, "grad_norm": 2.484375, "grad_norm_var": 0.03625895182291667, "learning_rate": 0.0001, "loss": 7.3281, "loss/crossentropy": 2.1661869287490845, "loss/hidden": 2.9296875, "loss/jsd": 0.0, "loss/logits": 0.22417958080768585, "step": 7008 }, { "epoch": 0.438125, "grad_norm": 3.015625, "grad_norm_var": 0.04631754557291667, "learning_rate": 0.0001, "loss": 7.2811, "loss/crossentropy": 2.257478713989258, "loss/hidden": 2.921875, "loss/jsd": 0.0, "loss/logits": 0.21776114404201508, "step": 7010 }, { "epoch": 0.43825, "grad_norm": 2.453125, "grad_norm_var": 0.04830729166666667, "learning_rate": 0.0001, "loss": 7.5795, "loss/crossentropy": 2.2374703884124756, "loss/hidden": 2.890625, "loss/jsd": 0.0, "loss/logits": 0.21662144362926483, "step": 7012 }, { "epoch": 0.438375, "grad_norm": 2.59375, "grad_norm_var": 0.04677632649739583, "learning_rate": 0.0001, "loss": 7.8624, "loss/crossentropy": 2.570241093635559, "loss/hidden": 3.0, "loss/jsd": 0.0, "loss/logits": 0.24564830213785172, "step": 7014 }, { "epoch": 0.4385, "grad_norm": 2.671875, "grad_norm_var": 0.04706624348958333, "learning_rate": 0.0001, "loss": 7.8729, "loss/crossentropy": 2.4280582666397095, "loss/hidden": 2.984375, "loss/jsd": 0.0, "loss/logits": 0.23895398527383804, "step": 7016 }, { "epoch": 0.438625, "grad_norm": 2.640625, "grad_norm_var": 0.04071858723958333, "learning_rate": 0.0001, "loss": 7.5618, "loss/crossentropy": 2.071039080619812, "loss/hidden": 2.890625, "loss/jsd": 0.0, "loss/logits": 0.21566591411828995, "step": 7018 }, { "epoch": 0.43875, "grad_norm": 2.546875, "grad_norm_var": 0.02310791015625, "learning_rate": 0.0001, "loss": 7.588, "loss/crossentropy": 2.2656747102737427, "loss/hidden": 2.921875, "loss/jsd": 0.0, "loss/logits": 0.22219014912843704, "step": 7020 }, { "epoch": 0.438875, "grad_norm": 2.484375, "grad_norm_var": 0.018879191080729166, "learning_rate": 0.0001, "loss": 7.3816, "loss/crossentropy": 2.1285390853881836, "loss/hidden": 2.9296875, "loss/jsd": 0.0, "loss/logits": 0.2286635935306549, "step": 7022 }, { "epoch": 0.439, "grad_norm": 2.578125, "grad_norm_var": 0.018290201822916668, "learning_rate": 0.0001, "loss": 7.4221, "loss/crossentropy": 2.277518391609192, "loss/hidden": 3.03125, "loss/jsd": 0.0, "loss/logits": 0.22867325693368912, "step": 7024 }, { "epoch": 0.439125, "grad_norm": 2.484375, "grad_norm_var": 0.019938151041666668, "learning_rate": 0.0001, "loss": 7.5894, "loss/crossentropy": 2.2145551443099976, "loss/hidden": 3.0625, "loss/jsd": 0.0, "loss/logits": 0.23324239999055862, "step": 7026 }, { "epoch": 0.43925, "grad_norm": 2.609375, "grad_norm_var": 0.018138631184895834, "learning_rate": 0.0001, "loss": 7.5294, "loss/crossentropy": 2.1618987321853638, "loss/hidden": 2.921875, "loss/jsd": 0.0, "loss/logits": 0.2244369089603424, "step": 7028 }, { "epoch": 0.439375, "grad_norm": 2.546875, "grad_norm_var": 0.0184478759765625, "learning_rate": 0.0001, "loss": 7.5884, "loss/crossentropy": 2.3471587896347046, "loss/hidden": 2.859375, "loss/jsd": 0.0, "loss/logits": 0.20863225311040878, "step": 7030 }, { "epoch": 0.4395, "grad_norm": 2.59375, "grad_norm_var": 0.0769195556640625, "learning_rate": 0.0001, "loss": 7.7253, "loss/crossentropy": 2.374652147293091, "loss/hidden": 2.9375, "loss/jsd": 0.0, "loss/logits": 0.23278672248125076, "step": 7032 }, { "epoch": 0.439625, "grad_norm": 2.625, "grad_norm_var": 0.07893473307291667, "learning_rate": 0.0001, "loss": 7.4704, "loss/crossentropy": 2.2305887937545776, "loss/hidden": 2.921875, "loss/jsd": 0.0, "loss/logits": 0.21743811666965485, "step": 7034 }, { "epoch": 0.43975, "grad_norm": 2.71875, "grad_norm_var": 0.07841389973958333, "learning_rate": 0.0001, "loss": 7.6484, "loss/crossentropy": 2.4501471519470215, "loss/hidden": 3.015625, "loss/jsd": 0.0, "loss/logits": 0.2342604100704193, "step": 7036 }, { "epoch": 0.439875, "grad_norm": 2.71875, "grad_norm_var": 0.07714436848958334, "learning_rate": 0.0001, "loss": 7.6575, "loss/crossentropy": 2.1478127241134644, "loss/hidden": 2.9140625, "loss/jsd": 0.0, "loss/logits": 0.2170637845993042, "step": 7038 }, { "epoch": 0.44, "grad_norm": 2.53125, "grad_norm_var": 0.08111979166666666, "learning_rate": 0.0001, "loss": 7.4412, "loss/crossentropy": 2.2282705307006836, "loss/hidden": 2.9140625, "loss/jsd": 0.0, "loss/logits": 0.21601247042417526, "step": 7040 }, { "epoch": 0.440125, "grad_norm": 2.765625, "grad_norm_var": 0.070703125, "learning_rate": 0.0001, "loss": 7.384, "loss/crossentropy": 2.206408977508545, "loss/hidden": 2.9453125, "loss/jsd": 0.0, "loss/logits": 0.22640597820281982, "step": 7042 }, { "epoch": 0.44025, "grad_norm": 3.609375, "grad_norm_var": 0.12250874837239584, "learning_rate": 0.0001, "loss": 7.605, "loss/crossentropy": 2.3545809984207153, "loss/hidden": 3.0, "loss/jsd": 0.0, "loss/logits": 0.23205594718456268, "step": 7044 }, { "epoch": 0.440375, "grad_norm": 2.5, "grad_norm_var": 0.12631734212239584, "learning_rate": 0.0001, "loss": 7.5055, "loss/crossentropy": 2.1955113410949707, "loss/hidden": 2.953125, "loss/jsd": 0.0, "loss/logits": 0.23007361590862274, "step": 7046 }, { "epoch": 0.4405, "grad_norm": 2.59375, "grad_norm_var": 0.07776285807291666, "learning_rate": 0.0001, "loss": 7.524, "loss/crossentropy": 2.378363013267517, "loss/hidden": 2.9296875, "loss/jsd": 0.0, "loss/logits": 0.21818215399980545, "step": 7048 }, { "epoch": 0.440625, "grad_norm": 2.625, "grad_norm_var": 0.0815582275390625, "learning_rate": 0.0001, "loss": 7.5031, "loss/crossentropy": 2.202028751373291, "loss/hidden": 2.9296875, "loss/jsd": 0.0, "loss/logits": 0.22879379987716675, "step": 7050 }, { "epoch": 0.44075, "grad_norm": 2.65625, "grad_norm_var": 0.08297119140625, "learning_rate": 0.0001, "loss": 7.5943, "loss/crossentropy": 2.2863932847976685, "loss/hidden": 2.9921875, "loss/jsd": 0.0, "loss/logits": 0.22971680015325546, "step": 7052 }, { "epoch": 0.440875, "grad_norm": 3.34375, "grad_norm_var": 1.9725819905598958, "learning_rate": 0.0001, "loss": 7.77, "loss/crossentropy": 2.5123019218444824, "loss/hidden": 3.1171875, "loss/jsd": 0.0, "loss/logits": 0.24296032637357712, "step": 7054 }, { "epoch": 0.441, "grad_norm": 2.609375, "grad_norm_var": 1.9467610677083333, "learning_rate": 0.0001, "loss": 7.57, "loss/crossentropy": 2.240562915802002, "loss/hidden": 2.9765625, "loss/jsd": 0.0, "loss/logits": 0.2290712669491768, "step": 7056 }, { "epoch": 0.441125, "grad_norm": 2.875, "grad_norm_var": 1.9437784830729166, "learning_rate": 0.0001, "loss": 7.7937, "loss/crossentropy": 2.2735445499420166, "loss/hidden": 3.046875, "loss/jsd": 0.0, "loss/logits": 0.24578536301851273, "step": 7058 }, { "epoch": 0.44125, "grad_norm": 2.578125, "grad_norm_var": 1.9438761393229167, "learning_rate": 0.0001, "loss": 7.5658, "loss/crossentropy": 2.13098806142807, "loss/hidden": 3.046875, "loss/jsd": 0.0, "loss/logits": 0.22668759524822235, "step": 7060 }, { "epoch": 0.441375, "grad_norm": 2.578125, "grad_norm_var": 1.9408355712890626, "learning_rate": 0.0001, "loss": 7.5376, "loss/crossentropy": 2.1684043407440186, "loss/hidden": 2.875, "loss/jsd": 0.0, "loss/logits": 0.21858279407024384, "step": 7062 }, { "epoch": 0.4415, "grad_norm": 3.875, "grad_norm_var": 1.9723052978515625, "learning_rate": 0.0001, "loss": 7.608, "loss/crossentropy": 2.2704185247421265, "loss/hidden": 2.9140625, "loss/jsd": 0.0, "loss/logits": 0.23027200996875763, "step": 7064 }, { "epoch": 0.441625, "grad_norm": 2.78125, "grad_norm_var": 1.9264882405598958, "learning_rate": 0.0001, "loss": 7.7046, "loss/crossentropy": 2.334232807159424, "loss/hidden": 2.9609375, "loss/jsd": 0.0, "loss/logits": 0.2393873706459999, "step": 7066 }, { "epoch": 0.44175, "grad_norm": 2.640625, "grad_norm_var": 1.9120442708333334, "learning_rate": 0.0001, "loss": 7.4638, "loss/crossentropy": 2.1081438064575195, "loss/hidden": 2.9765625, "loss/jsd": 0.0, "loss/logits": 0.21786806732416153, "step": 7068 }, { "epoch": 0.441875, "grad_norm": 2.578125, "grad_norm_var": 0.11436258951822917, "learning_rate": 0.0001, "loss": 7.5484, "loss/crossentropy": 2.55453884601593, "loss/hidden": 3.109375, "loss/jsd": 0.0, "loss/logits": 0.24744783341884613, "step": 7070 }, { "epoch": 0.442, "grad_norm": 2.734375, "grad_norm_var": 0.1135894775390625, "learning_rate": 0.0001, "loss": 7.4961, "loss/crossentropy": 2.2864627838134766, "loss/hidden": 2.921875, "loss/jsd": 0.0, "loss/logits": 0.22755438089370728, "step": 7072 }, { "epoch": 0.442125, "grad_norm": 2.59375, "grad_norm_var": 0.12008056640625, "learning_rate": 0.0001, "loss": 7.3998, "loss/crossentropy": 2.090472996234894, "loss/hidden": 3.0078125, "loss/jsd": 0.0, "loss/logits": 0.21925938874483109, "step": 7074 }, { "epoch": 0.44225, "grad_norm": 2.546875, "grad_norm_var": 0.1263671875, "learning_rate": 0.0001, "loss": 7.4241, "loss/crossentropy": 2.207358717918396, "loss/hidden": 2.953125, "loss/jsd": 0.0, "loss/logits": 0.22418345510959625, "step": 7076 }, { "epoch": 0.442375, "grad_norm": 2.46875, "grad_norm_var": 0.11496480305989583, "learning_rate": 0.0001, "loss": 7.4274, "loss/crossentropy": 2.229384422302246, "loss/hidden": 2.96875, "loss/jsd": 0.0, "loss/logits": 0.22739236056804657, "step": 7078 }, { "epoch": 0.4425, "grad_norm": 3.203125, "grad_norm_var": 0.2537109375, "learning_rate": 0.0001, "loss": 7.5958, "loss/crossentropy": 2.269618034362793, "loss/hidden": 3.1171875, "loss/jsd": 0.0, "loss/logits": 0.2671513184905052, "step": 7080 }, { "epoch": 0.442625, "grad_norm": 2.515625, "grad_norm_var": 0.2565388997395833, "learning_rate": 0.0001, "loss": 7.6417, "loss/crossentropy": 2.4567424058914185, "loss/hidden": 3.046875, "loss/jsd": 0.0, "loss/logits": 0.23014353215694427, "step": 7082 }, { "epoch": 0.44275, "grad_norm": 3.328125, "grad_norm_var": 0.27841389973958336, "learning_rate": 0.0001, "loss": 7.745, "loss/crossentropy": 2.1631439924240112, "loss/hidden": 3.171875, "loss/jsd": 0.0, "loss/logits": 0.24584712088108063, "step": 7084 }, { "epoch": 0.442875, "grad_norm": 2.84375, "grad_norm_var": 0.27514546712239585, "learning_rate": 0.0001, "loss": 8.0352, "loss/crossentropy": 2.550360918045044, "loss/hidden": 2.984375, "loss/jsd": 0.0, "loss/logits": 0.22847569733858109, "step": 7086 }, { "epoch": 0.443, "grad_norm": 2.6875, "grad_norm_var": 0.27323811848958335, "learning_rate": 0.0001, "loss": 7.6537, "loss/crossentropy": 2.2571427822113037, "loss/hidden": 3.0625, "loss/jsd": 0.0, "loss/logits": 0.2339075207710266, "step": 7088 }, { "epoch": 0.443125, "grad_norm": 2.8125, "grad_norm_var": 0.25982666015625, "learning_rate": 0.0001, "loss": 7.8242, "loss/crossentropy": 2.176592707633972, "loss/hidden": 2.9921875, "loss/jsd": 0.0, "loss/logits": 0.23183703422546387, "step": 7090 }, { "epoch": 0.44325, "grad_norm": 2.796875, "grad_norm_var": 0.2625885009765625, "learning_rate": 0.0001, "loss": 7.4383, "loss/crossentropy": 2.1300759315490723, "loss/hidden": 3.015625, "loss/jsd": 0.0, "loss/logits": 0.23205139487981796, "step": 7092 }, { "epoch": 0.443375, "grad_norm": 2.65625, "grad_norm_var": 0.24150390625, "learning_rate": 0.0001, "loss": 7.5725, "loss/crossentropy": 2.1009660959243774, "loss/hidden": 3.078125, "loss/jsd": 0.0, "loss/logits": 0.23328103870153427, "step": 7094 }, { "epoch": 0.4435, "grad_norm": 2.40625, "grad_norm_var": 0.07414449055989583, "learning_rate": 0.0001, "loss": 7.5588, "loss/crossentropy": 2.365289092063904, "loss/hidden": 2.9765625, "loss/jsd": 0.0, "loss/logits": 0.2319376841187477, "step": 7096 }, { "epoch": 0.443625, "grad_norm": 2.515625, "grad_norm_var": 0.07727864583333334, "learning_rate": 0.0001, "loss": 7.7098, "loss/crossentropy": 2.1671372652053833, "loss/hidden": 2.953125, "loss/jsd": 0.0, "loss/logits": 0.21319618821144104, "step": 7098 }, { "epoch": 0.44375, "grad_norm": 3.015625, "grad_norm_var": 0.05845947265625, "learning_rate": 0.0001, "loss": 7.6182, "loss/crossentropy": 2.5064644813537598, "loss/hidden": 3.046875, "loss/jsd": 0.0, "loss/logits": 0.27000120282173157, "step": 7100 }, { "epoch": 0.443875, "grad_norm": 2.515625, "grad_norm_var": 0.06336263020833334, "learning_rate": 0.0001, "loss": 7.5976, "loss/crossentropy": 2.3661924600601196, "loss/hidden": 3.015625, "loss/jsd": 0.0, "loss/logits": 0.2323109656572342, "step": 7102 }, { "epoch": 0.444, "grad_norm": 2.5625, "grad_norm_var": 0.06464436848958334, "learning_rate": 0.0001, "loss": 7.7929, "loss/crossentropy": 2.5370858907699585, "loss/hidden": 3.0078125, "loss/jsd": 0.0, "loss/logits": 0.23859833925962448, "step": 7104 }, { "epoch": 0.444125, "grad_norm": 2.484375, "grad_norm_var": 0.06907145182291667, "learning_rate": 0.0001, "loss": 7.6318, "loss/crossentropy": 2.3168283700942993, "loss/hidden": 2.9296875, "loss/jsd": 0.0, "loss/logits": 0.20977229624986649, "step": 7106 }, { "epoch": 0.44425, "grad_norm": 2.65625, "grad_norm_var": 0.026854451497395834, "learning_rate": 0.0001, "loss": 7.7068, "loss/crossentropy": 2.276731491088867, "loss/hidden": 3.0703125, "loss/jsd": 0.0, "loss/logits": 0.26407164335250854, "step": 7108 }, { "epoch": 0.444375, "grad_norm": 2.640625, "grad_norm_var": 0.0256256103515625, "learning_rate": 0.0001, "loss": 7.5281, "loss/crossentropy": 2.226925849914551, "loss/hidden": 2.921875, "loss/jsd": 0.0, "loss/logits": 0.20309847593307495, "step": 7110 }, { "epoch": 0.4445, "grad_norm": 2.734375, "grad_norm_var": 0.020799763997395835, "learning_rate": 0.0001, "loss": 7.6123, "loss/crossentropy": 2.0182350277900696, "loss/hidden": 2.9765625, "loss/jsd": 0.0, "loss/logits": 0.22488389164209366, "step": 7112 }, { "epoch": 0.444625, "grad_norm": 2.890625, "grad_norm_var": 0.021141560872395833, "learning_rate": 0.0001, "loss": 7.3151, "loss/crossentropy": 2.305272102355957, "loss/hidden": 2.9765625, "loss/jsd": 0.0, "loss/logits": 0.22948481142520905, "step": 7114 }, { "epoch": 0.44475, "grad_norm": 2.703125, "grad_norm_var": 0.0154296875, "learning_rate": 0.0001, "loss": 7.7247, "loss/crossentropy": 2.2902008295059204, "loss/hidden": 2.9921875, "loss/jsd": 0.0, "loss/logits": 0.22371366620063782, "step": 7116 }, { "epoch": 0.444875, "grad_norm": 2.796875, "grad_norm_var": 0.0143463134765625, "learning_rate": 0.0001, "loss": 7.6547, "loss/crossentropy": 2.2686209678649902, "loss/hidden": 3.015625, "loss/jsd": 0.0, "loss/logits": 0.25545573234558105, "step": 7118 }, { "epoch": 0.445, "grad_norm": 2.515625, "grad_norm_var": 0.015680948893229168, "learning_rate": 0.0001, "loss": 7.4213, "loss/crossentropy": 2.2258291244506836, "loss/hidden": 3.0, "loss/jsd": 0.0, "loss/logits": 0.2323731780052185, "step": 7120 }, { "epoch": 0.445125, "grad_norm": 2.40625, "grad_norm_var": 0.017219034830729167, "learning_rate": 0.0001, "loss": 7.2783, "loss/crossentropy": 2.177391529083252, "loss/hidden": 2.8828125, "loss/jsd": 0.0, "loss/logits": 0.2272118330001831, "step": 7122 }, { "epoch": 0.44525, "grad_norm": 2.734375, "grad_norm_var": 0.017313639322916668, "learning_rate": 0.0001, "loss": 7.6465, "loss/crossentropy": 2.5812461376190186, "loss/hidden": 3.0546875, "loss/jsd": 0.0, "loss/logits": 0.23729535937309265, "step": 7124 }, { "epoch": 0.445375, "grad_norm": 2.515625, "grad_norm_var": 0.01923828125, "learning_rate": 0.0001, "loss": 7.5007, "loss/crossentropy": 2.279215455055237, "loss/hidden": 2.9765625, "loss/jsd": 0.0, "loss/logits": 0.22710410505533218, "step": 7126 }, { "epoch": 0.4455, "grad_norm": 2.578125, "grad_norm_var": 0.017780558268229166, "learning_rate": 0.0001, "loss": 7.4811, "loss/crossentropy": 2.1434578895568848, "loss/hidden": 2.96875, "loss/jsd": 0.0, "loss/logits": 0.2134815901517868, "step": 7128 }, { "epoch": 0.445625, "grad_norm": 2.703125, "grad_norm_var": 0.0657379150390625, "learning_rate": 0.0001, "loss": 7.3584, "loss/crossentropy": 2.007891356945038, "loss/hidden": 2.890625, "loss/jsd": 0.0, "loss/logits": 0.22493670880794525, "step": 7130 }, { "epoch": 0.44575, "grad_norm": 2.53125, "grad_norm_var": 0.06568603515625, "learning_rate": 0.0001, "loss": 7.7317, "loss/crossentropy": 2.361915707588196, "loss/hidden": 3.0703125, "loss/jsd": 0.0, "loss/logits": 0.22492821514606476, "step": 7132 }, { "epoch": 0.445875, "grad_norm": 2.4375, "grad_norm_var": 0.06682535807291666, "learning_rate": 0.0001, "loss": 7.5972, "loss/crossentropy": 2.442040801048279, "loss/hidden": 2.953125, "loss/jsd": 0.0, "loss/logits": 0.23443201929330826, "step": 7134 }, { "epoch": 0.446, "grad_norm": 2.96875, "grad_norm_var": 0.0756744384765625, "learning_rate": 0.0001, "loss": 7.6585, "loss/crossentropy": 2.3308969736099243, "loss/hidden": 3.125, "loss/jsd": 0.0, "loss/logits": 0.26816996932029724, "step": 7136 }, { "epoch": 0.446125, "grad_norm": 2.421875, "grad_norm_var": 0.07541910807291667, "learning_rate": 0.0001, "loss": 7.5335, "loss/crossentropy": 2.4410794973373413, "loss/hidden": 3.0859375, "loss/jsd": 0.0, "loss/logits": 0.23623879998922348, "step": 7138 }, { "epoch": 0.44625, "grad_norm": 3.578125, "grad_norm_var": 0.23479410807291667, "learning_rate": 0.0001, "loss": 7.6097, "loss/crossentropy": 2.3236881494522095, "loss/hidden": 2.953125, "loss/jsd": 0.0, "loss/logits": 0.2385316640138626, "step": 7140 }, { "epoch": 0.446375, "grad_norm": 2.5625, "grad_norm_var": 0.22346598307291668, "learning_rate": 0.0001, "loss": 7.4261, "loss/crossentropy": 2.204988479614258, "loss/hidden": 2.8828125, "loss/jsd": 0.0, "loss/logits": 0.21536948531866074, "step": 7142 }, { "epoch": 0.4465, "grad_norm": 2.46875, "grad_norm_var": 0.2281158447265625, "learning_rate": 0.0001, "loss": 7.4819, "loss/crossentropy": 2.2814563512802124, "loss/hidden": 2.8984375, "loss/jsd": 0.0, "loss/logits": 0.22041501104831696, "step": 7144 }, { "epoch": 0.446625, "grad_norm": 2.796875, "grad_norm_var": 0.19472554524739583, "learning_rate": 0.0001, "loss": 7.657, "loss/crossentropy": 2.4272972345352173, "loss/hidden": 3.0, "loss/jsd": 0.0, "loss/logits": 0.22483063489198685, "step": 7146 }, { "epoch": 0.44675, "grad_norm": 2.484375, "grad_norm_var": 0.19885660807291666, "learning_rate": 0.0001, "loss": 7.3515, "loss/crossentropy": 2.0474237203598022, "loss/hidden": 2.9765625, "loss/jsd": 0.0, "loss/logits": 0.20978070050477982, "step": 7148 }, { "epoch": 0.446875, "grad_norm": 2.71875, "grad_norm_var": 0.19039306640625, "learning_rate": 0.0001, "loss": 7.4255, "loss/crossentropy": 2.2677470445632935, "loss/hidden": 2.9140625, "loss/jsd": 0.0, "loss/logits": 0.2215772196650505, "step": 7150 }, { "epoch": 0.447, "grad_norm": 2.515625, "grad_norm_var": 0.19412333170572918, "learning_rate": 0.0001, "loss": 7.6, "loss/crossentropy": 2.1507134437561035, "loss/hidden": 3.03125, "loss/jsd": 0.0, "loss/logits": 0.23050490021705627, "step": 7152 }, { "epoch": 0.447125, "grad_norm": 2.4375, "grad_norm_var": 0.19595947265625, "learning_rate": 0.0001, "loss": 7.4009, "loss/crossentropy": 2.127189040184021, "loss/hidden": 2.8984375, "loss/jsd": 0.0, "loss/logits": 0.2151520624756813, "step": 7154 }, { "epoch": 0.44725, "grad_norm": 2.625, "grad_norm_var": 0.012105305989583334, "learning_rate": 0.0001, "loss": 7.7275, "loss/crossentropy": 2.2342575788497925, "loss/hidden": 2.9921875, "loss/jsd": 0.0, "loss/logits": 0.22457968443632126, "step": 7156 }, { "epoch": 0.447375, "grad_norm": 6.625, "grad_norm_var": 1.1698201497395833, "learning_rate": 0.0001, "loss": 7.8055, "loss/crossentropy": 2.118963360786438, "loss/hidden": 3.109375, "loss/jsd": 0.0, "loss/logits": 0.25428134202957153, "step": 7158 }, { "epoch": 0.4475, "grad_norm": 2.796875, "grad_norm_var": 1.1612945556640626, "learning_rate": 0.0001, "loss": 7.8545, "loss/crossentropy": 2.2694531679153442, "loss/hidden": 2.9921875, "loss/jsd": 0.0, "loss/logits": 0.24252331256866455, "step": 7160 }, { "epoch": 0.447625, "grad_norm": 2.65625, "grad_norm_var": 1.17369384765625, "learning_rate": 0.0001, "loss": 7.4611, "loss/crossentropy": 2.1240158677101135, "loss/hidden": 2.984375, "loss/jsd": 0.0, "loss/logits": 0.2107238620519638, "step": 7162 }, { "epoch": 0.44775, "grad_norm": 2.78125, "grad_norm_var": 1.1471638997395834, "learning_rate": 0.0001, "loss": 7.5175, "loss/crossentropy": 2.161839246749878, "loss/hidden": 3.109375, "loss/jsd": 0.0, "loss/logits": 0.22657044231891632, "step": 7164 }, { "epoch": 0.447875, "grad_norm": 2.734375, "grad_norm_var": 1.146507771809896, "learning_rate": 0.0001, "loss": 7.849, "loss/crossentropy": 2.3722153902053833, "loss/hidden": 2.984375, "loss/jsd": 0.0, "loss/logits": 0.23769185692071915, "step": 7166 }, { "epoch": 0.448, "grad_norm": 3.9375, "grad_norm_var": 1.163654581705729, "learning_rate": 0.0001, "loss": 7.7199, "loss/crossentropy": 2.3477590084075928, "loss/hidden": 3.1171875, "loss/jsd": 0.0, "loss/logits": 0.2469366490840912, "step": 7168 }, { "epoch": 0.448125, "grad_norm": 2.96875, "grad_norm_var": 1.0994954427083334, "learning_rate": 0.0001, "loss": 7.7189, "loss/crossentropy": 2.0979931950569153, "loss/hidden": 2.96875, "loss/jsd": 0.0, "loss/logits": 0.2308725118637085, "step": 7170 }, { "epoch": 0.44825, "grad_norm": 2.609375, "grad_norm_var": 1.10220947265625, "learning_rate": 0.0001, "loss": 7.4667, "loss/crossentropy": 1.9598948955535889, "loss/hidden": 2.859375, "loss/jsd": 0.0, "loss/logits": 0.21548035740852356, "step": 7172 }, { "epoch": 0.448375, "grad_norm": 2.609375, "grad_norm_var": 0.14980061848958334, "learning_rate": 0.0001, "loss": 7.6388, "loss/crossentropy": 2.2749656438827515, "loss/hidden": 3.0, "loss/jsd": 0.0, "loss/logits": 0.2280566245317459, "step": 7174 }, { "epoch": 0.4485, "grad_norm": 2.515625, "grad_norm_var": 0.12552083333333333, "learning_rate": 0.0001, "loss": 7.5165, "loss/crossentropy": 2.2919198274612427, "loss/hidden": 2.890625, "loss/jsd": 0.0, "loss/logits": 0.22544564306735992, "step": 7176 }, { "epoch": 0.448625, "grad_norm": 2.609375, "grad_norm_var": 0.12390034993489583, "learning_rate": 0.0001, "loss": 7.5851, "loss/crossentropy": 2.208395838737488, "loss/hidden": 2.953125, "loss/jsd": 0.0, "loss/logits": 0.22555015981197357, "step": 7178 }, { "epoch": 0.44875, "grad_norm": 2.5625, "grad_norm_var": 0.12812093098958333, "learning_rate": 0.0001, "loss": 7.5168, "loss/crossentropy": 2.026079475879669, "loss/hidden": 3.046875, "loss/jsd": 0.0, "loss/logits": 0.2029392421245575, "step": 7180 }, { "epoch": 0.448875, "grad_norm": 2.8125, "grad_norm_var": 0.13262430826822916, "learning_rate": 0.0001, "loss": 7.5738, "loss/crossentropy": 2.3827956914901733, "loss/hidden": 3.109375, "loss/jsd": 0.0, "loss/logits": 0.2374342456459999, "step": 7182 }, { "epoch": 0.449, "grad_norm": 2.90625, "grad_norm_var": 0.09158528645833333, "learning_rate": 0.0001, "loss": 7.7397, "loss/crossentropy": 2.416311264038086, "loss/hidden": 3.09375, "loss/jsd": 0.0, "loss/logits": 0.28401951491832733, "step": 7184 }, { "epoch": 0.449125, "grad_norm": 2.625, "grad_norm_var": 0.07877197265625, "learning_rate": 0.0001, "loss": 7.5488, "loss/crossentropy": 2.2222553491592407, "loss/hidden": 2.96875, "loss/jsd": 0.0, "loss/logits": 0.2201531007885933, "step": 7186 }, { "epoch": 0.44925, "grad_norm": 2.53125, "grad_norm_var": 0.08077799479166667, "learning_rate": 0.0001, "loss": 7.6088, "loss/crossentropy": 2.4663448333740234, "loss/hidden": 2.96875, "loss/jsd": 0.0, "loss/logits": 0.23039235919713974, "step": 7188 }, { "epoch": 0.449375, "grad_norm": 2.59375, "grad_norm_var": 0.08183186848958333, "learning_rate": 0.0001, "loss": 7.3937, "loss/crossentropy": 1.883381724357605, "loss/hidden": 2.9765625, "loss/jsd": 0.0, "loss/logits": 0.22149814665317535, "step": 7190 }, { "epoch": 0.4495, "grad_norm": 2.640625, "grad_norm_var": 0.08212788899739583, "learning_rate": 0.0001, "loss": 7.6256, "loss/crossentropy": 2.4125300645828247, "loss/hidden": 2.953125, "loss/jsd": 0.0, "loss/logits": 0.22125422954559326, "step": 7192 }, { "epoch": 0.449625, "grad_norm": 2.640625, "grad_norm_var": 0.08129781087239583, "learning_rate": 0.0001, "loss": 7.3776, "loss/crossentropy": 2.156591773033142, "loss/hidden": 2.9921875, "loss/jsd": 0.0, "loss/logits": 0.21974274516105652, "step": 7194 }, { "epoch": 0.44975, "grad_norm": 2.546875, "grad_norm_var": 0.0810943603515625, "learning_rate": 0.0001, "loss": 7.6307, "loss/crossentropy": 2.2000932693481445, "loss/hidden": 2.9921875, "loss/jsd": 0.0, "loss/logits": 0.24775639921426773, "step": 7196 }, { "epoch": 0.449875, "grad_norm": 2.59375, "grad_norm_var": 0.07349853515625, "learning_rate": 0.0001, "loss": 7.6432, "loss/crossentropy": 2.3233494758605957, "loss/hidden": 2.96875, "loss/jsd": 0.0, "loss/logits": 0.24514156579971313, "step": 7198 }, { "epoch": 0.45, "grad_norm": 2.453125, "grad_norm_var": 0.006110636393229166, "learning_rate": 0.0001, "loss": 7.428, "loss/crossentropy": 2.5261600017547607, "loss/hidden": 3.046875, "loss/jsd": 0.0, "loss/logits": 0.23090995848178864, "step": 7200 }, { "epoch": 0.450125, "grad_norm": 2.34375, "grad_norm_var": 0.010090128580729166, "learning_rate": 0.0001, "loss": 7.2492, "loss/crossentropy": 2.1095964908599854, "loss/hidden": 2.8515625, "loss/jsd": 0.0, "loss/logits": 0.20491775125265121, "step": 7202 }, { "epoch": 0.45025, "grad_norm": 2.453125, "grad_norm_var": 0.010445149739583333, "learning_rate": 0.0001, "loss": 7.5387, "loss/crossentropy": 2.0823811292648315, "loss/hidden": 2.8671875, "loss/jsd": 0.0, "loss/logits": 0.20920012146234512, "step": 7204 }, { "epoch": 0.450375, "grad_norm": 2.578125, "grad_norm_var": 0.010282389322916667, "learning_rate": 0.0001, "loss": 7.5622, "loss/crossentropy": 2.3653724193573, "loss/hidden": 3.0, "loss/jsd": 0.0, "loss/logits": 0.22337710857391357, "step": 7206 }, { "epoch": 0.4505, "grad_norm": 2.546875, "grad_norm_var": 0.013232421875, "learning_rate": 0.0001, "loss": 7.7664, "loss/crossentropy": 2.4810922145843506, "loss/hidden": 3.078125, "loss/jsd": 0.0, "loss/logits": 0.24244581162929535, "step": 7208 }, { "epoch": 0.450625, "grad_norm": 2.578125, "grad_norm_var": 0.011149088541666666, "learning_rate": 0.0001, "loss": 7.5231, "loss/crossentropy": 2.1666014194488525, "loss/hidden": 2.90625, "loss/jsd": 0.0, "loss/logits": 0.22514060139656067, "step": 7210 }, { "epoch": 0.45075, "grad_norm": 2.484375, "grad_norm_var": 0.011131795247395833, "learning_rate": 0.0001, "loss": 7.5948, "loss/crossentropy": 2.1480305194854736, "loss/hidden": 2.9921875, "loss/jsd": 0.0, "loss/logits": 0.22327903658151627, "step": 7212 }, { "epoch": 0.450875, "grad_norm": 2.671875, "grad_norm_var": 0.011790974934895834, "learning_rate": 0.0001, "loss": 7.43, "loss/crossentropy": 2.414106249809265, "loss/hidden": 2.953125, "loss/jsd": 0.0, "loss/logits": 0.2344641387462616, "step": 7214 }, { "epoch": 0.451, "grad_norm": 2.5625, "grad_norm_var": 0.011457316080729167, "learning_rate": 0.0001, "loss": 7.2992, "loss/crossentropy": 2.049868106842041, "loss/hidden": 2.9453125, "loss/jsd": 0.0, "loss/logits": 0.2359739914536476, "step": 7216 }, { "epoch": 0.451125, "grad_norm": 2.375, "grad_norm_var": 0.010152180989583334, "learning_rate": 0.0001, "loss": 7.2407, "loss/crossentropy": 1.8944953680038452, "loss/hidden": 2.8671875, "loss/jsd": 0.0, "loss/logits": 0.19045981764793396, "step": 7218 }, { "epoch": 0.45125, "grad_norm": 2.59375, "grad_norm_var": 0.010042317708333333, "learning_rate": 0.0001, "loss": 7.3649, "loss/crossentropy": 2.261385679244995, "loss/hidden": 3.0625, "loss/jsd": 0.0, "loss/logits": 0.24006683379411697, "step": 7220 }, { "epoch": 0.451375, "grad_norm": 2.546875, "grad_norm_var": 0.01021728515625, "learning_rate": 0.0001, "loss": 7.6197, "loss/crossentropy": 2.2726603746414185, "loss/hidden": 3.0078125, "loss/jsd": 0.0, "loss/logits": 0.20860479027032852, "step": 7222 }, { "epoch": 0.4515, "grad_norm": 2.59375, "grad_norm_var": 0.006494140625, "learning_rate": 0.0001, "loss": 7.4804, "loss/crossentropy": 2.1522574424743652, "loss/hidden": 2.9765625, "loss/jsd": 0.0, "loss/logits": 0.2046615481376648, "step": 7224 }, { "epoch": 0.451625, "grad_norm": 2.65625, "grad_norm_var": 0.008772786458333333, "learning_rate": 0.0001, "loss": 7.2808, "loss/crossentropy": 2.0967873334884644, "loss/hidden": 2.859375, "loss/jsd": 0.0, "loss/logits": 0.2074371874332428, "step": 7226 }, { "epoch": 0.45175, "grad_norm": 2.703125, "grad_norm_var": 0.010374959309895833, "learning_rate": 0.0001, "loss": 7.4005, "loss/crossentropy": 2.0253679752349854, "loss/hidden": 3.015625, "loss/jsd": 0.0, "loss/logits": 0.22992508858442307, "step": 7228 }, { "epoch": 0.451875, "grad_norm": 2.53125, "grad_norm_var": 0.009065755208333333, "learning_rate": 0.0001, "loss": 7.629, "loss/crossentropy": 2.3401378393173218, "loss/hidden": 2.9765625, "loss/jsd": 0.0, "loss/logits": 0.2194272056221962, "step": 7230 }, { "epoch": 0.452, "grad_norm": 2.59375, "grad_norm_var": 0.009129842122395834, "learning_rate": 0.0001, "loss": 7.6802, "loss/crossentropy": 2.25095397233963, "loss/hidden": 2.96875, "loss/jsd": 0.0, "loss/logits": 0.22135108709335327, "step": 7232 }, { "epoch": 0.452125, "grad_norm": 2.390625, "grad_norm_var": 0.009137980143229167, "learning_rate": 0.0001, "loss": 7.3502, "loss/crossentropy": 2.407867908477783, "loss/hidden": 2.8984375, "loss/jsd": 0.0, "loss/logits": 0.22576645761728287, "step": 7234 }, { "epoch": 0.45225, "grad_norm": 2.9375, "grad_norm_var": 0.019364420572916666, "learning_rate": 0.0001, "loss": 7.6769, "loss/crossentropy": 2.357957363128662, "loss/hidden": 2.90625, "loss/jsd": 0.0, "loss/logits": 0.21956248581409454, "step": 7236 }, { "epoch": 0.452375, "grad_norm": 2.59375, "grad_norm_var": 0.025007120768229165, "learning_rate": 0.0001, "loss": 7.7281, "loss/crossentropy": 2.21242094039917, "loss/hidden": 2.9609375, "loss/jsd": 0.0, "loss/logits": 0.23065873235464096, "step": 7238 }, { "epoch": 0.4525, "grad_norm": 2.4375, "grad_norm_var": 0.025658162434895833, "learning_rate": 0.0001, "loss": 7.4519, "loss/crossentropy": 2.1718413829803467, "loss/hidden": 2.984375, "loss/jsd": 0.0, "loss/logits": 0.23012802004814148, "step": 7240 }, { "epoch": 0.452625, "grad_norm": 2.40625, "grad_norm_var": 0.024054972330729167, "learning_rate": 0.0001, "loss": 7.3611, "loss/crossentropy": 2.2822405099868774, "loss/hidden": 2.9296875, "loss/jsd": 0.0, "loss/logits": 0.22243288904428482, "step": 7242 }, { "epoch": 0.45275, "grad_norm": 2.65625, "grad_norm_var": 0.022884114583333334, "learning_rate": 0.0001, "loss": 7.6297, "loss/crossentropy": 2.3738062381744385, "loss/hidden": 3.0, "loss/jsd": 0.0, "loss/logits": 0.22534438967704773, "step": 7244 }, { "epoch": 0.452875, "grad_norm": 2.9375, "grad_norm_var": 0.030524698893229167, "learning_rate": 0.0001, "loss": 7.4257, "loss/crossentropy": 2.269803285598755, "loss/hidden": 3.09375, "loss/jsd": 0.0, "loss/logits": 0.24789445102214813, "step": 7246 }, { "epoch": 0.453, "grad_norm": 2.671875, "grad_norm_var": 0.030736287434895832, "learning_rate": 0.0001, "loss": 7.6446, "loss/crossentropy": 2.3432403802871704, "loss/hidden": 2.9609375, "loss/jsd": 0.0, "loss/logits": 0.22805560380220413, "step": 7248 }, { "epoch": 0.453125, "grad_norm": 2.4375, "grad_norm_var": 0.0276763916015625, "learning_rate": 0.0001, "loss": 7.4692, "loss/crossentropy": 2.204472064971924, "loss/hidden": 2.9453125, "loss/jsd": 0.0, "loss/logits": 0.23935247957706451, "step": 7250 }, { "epoch": 0.45325, "grad_norm": 2.71875, "grad_norm_var": 0.021663411458333334, "learning_rate": 0.0001, "loss": 7.394, "loss/crossentropy": 2.243639826774597, "loss/hidden": 2.9140625, "loss/jsd": 0.0, "loss/logits": 0.21758561581373215, "step": 7252 }, { "epoch": 0.453375, "grad_norm": 2.625, "grad_norm_var": 0.017894490559895834, "learning_rate": 0.0001, "loss": 7.6016, "loss/crossentropy": 2.3086966276168823, "loss/hidden": 2.890625, "loss/jsd": 0.0, "loss/logits": 0.20768097043037415, "step": 7254 }, { "epoch": 0.4535, "grad_norm": 2.421875, "grad_norm_var": 0.018680826822916666, "learning_rate": 0.0001, "loss": 7.4309, "loss/crossentropy": 2.170769155025482, "loss/hidden": 2.8984375, "loss/jsd": 0.0, "loss/logits": 0.22499176859855652, "step": 7256 }, { "epoch": 0.453625, "grad_norm": 2.625, "grad_norm_var": 0.017634073893229168, "learning_rate": 0.0001, "loss": 7.581, "loss/crossentropy": 2.3726818561553955, "loss/hidden": 2.9921875, "loss/jsd": 0.0, "loss/logits": 0.2248682603240013, "step": 7258 }, { "epoch": 0.45375, "grad_norm": 2.578125, "grad_norm_var": 0.0168365478515625, "learning_rate": 0.0001, "loss": 7.4269, "loss/crossentropy": 2.4007468223571777, "loss/hidden": 3.0078125, "loss/jsd": 0.0, "loss/logits": 0.22981403023004532, "step": 7260 }, { "epoch": 0.453875, "grad_norm": 2.578125, "grad_norm_var": 0.0487701416015625, "learning_rate": 0.0001, "loss": 7.7007, "loss/crossentropy": 2.101936459541321, "loss/hidden": 3.0078125, "loss/jsd": 0.0, "loss/logits": 0.23276102542877197, "step": 7262 }, { "epoch": 0.454, "grad_norm": 2.765625, "grad_norm_var": 0.04967041015625, "learning_rate": 0.0001, "loss": 7.6061, "loss/crossentropy": 2.2694408893585205, "loss/hidden": 3.0078125, "loss/jsd": 0.0, "loss/logits": 0.22756026685237885, "step": 7264 }, { "epoch": 0.454125, "grad_norm": 2.453125, "grad_norm_var": 0.049723307291666664, "learning_rate": 0.0001, "loss": 7.2311, "loss/crossentropy": 2.1606765389442444, "loss/hidden": 2.96875, "loss/jsd": 0.0, "loss/logits": 0.2117350548505783, "step": 7266 }, { "epoch": 0.45425, "grad_norm": 2.609375, "grad_norm_var": 0.051708984375, "learning_rate": 0.0001, "loss": 7.8017, "loss/crossentropy": 2.295064330101013, "loss/hidden": 3.0234375, "loss/jsd": 0.0, "loss/logits": 0.24931836128234863, "step": 7268 }, { "epoch": 0.454375, "grad_norm": 2.765625, "grad_norm_var": 0.0592437744140625, "learning_rate": 0.0001, "loss": 7.7327, "loss/crossentropy": 2.4177534580230713, "loss/hidden": 2.984375, "loss/jsd": 0.0, "loss/logits": 0.24050623178482056, "step": 7270 }, { "epoch": 0.4545, "grad_norm": 2.625, "grad_norm_var": 0.05530192057291667, "learning_rate": 0.0001, "loss": 7.6006, "loss/crossentropy": 2.288196086883545, "loss/hidden": 2.90625, "loss/jsd": 0.0, "loss/logits": 0.2261577621102333, "step": 7272 }, { "epoch": 0.454625, "grad_norm": 2.421875, "grad_norm_var": 0.056689453125, "learning_rate": 0.0001, "loss": 7.4504, "loss/crossentropy": 2.2437087297439575, "loss/hidden": 3.015625, "loss/jsd": 0.0, "loss/logits": 0.21646377444267273, "step": 7274 }, { "epoch": 0.45475, "grad_norm": 2.53125, "grad_norm_var": 0.0580718994140625, "learning_rate": 0.0001, "loss": 7.6381, "loss/crossentropy": 2.16395902633667, "loss/hidden": 2.890625, "loss/jsd": 0.0, "loss/logits": 0.2004098817706108, "step": 7276 }, { "epoch": 0.454875, "grad_norm": 2.6875, "grad_norm_var": 0.02099609375, "learning_rate": 0.0001, "loss": 7.275, "loss/crossentropy": 2.2982800006866455, "loss/hidden": 2.890625, "loss/jsd": 0.0, "loss/logits": 0.22220954298973083, "step": 7278 }, { "epoch": 0.455, "grad_norm": 2.59375, "grad_norm_var": 0.019530232747395834, "learning_rate": 0.0001, "loss": 7.5913, "loss/crossentropy": 2.1910059452056885, "loss/hidden": 3.0390625, "loss/jsd": 0.0, "loss/logits": 0.2383841723203659, "step": 7280 }, { "epoch": 0.455125, "grad_norm": 2.59375, "grad_norm_var": 0.016927083333333332, "learning_rate": 0.0001, "loss": 7.4564, "loss/crossentropy": 2.3096718788146973, "loss/hidden": 2.890625, "loss/jsd": 0.0, "loss/logits": 0.23010662198066711, "step": 7282 }, { "epoch": 0.45525, "grad_norm": 2.59375, "grad_norm_var": 0.015941365559895834, "learning_rate": 0.0001, "loss": 7.4984, "loss/crossentropy": 2.341844081878662, "loss/hidden": 2.9375, "loss/jsd": 0.0, "loss/logits": 0.21651576459407806, "step": 7284 }, { "epoch": 0.455375, "grad_norm": 2.71875, "grad_norm_var": 0.0054972330729166664, "learning_rate": 0.0001, "loss": 7.596, "loss/crossentropy": 2.217790722846985, "loss/hidden": 2.9453125, "loss/jsd": 0.0, "loss/logits": 0.22092267870903015, "step": 7286 }, { "epoch": 0.4555, "grad_norm": 2.546875, "grad_norm_var": 0.005370076497395833, "learning_rate": 0.0001, "loss": 7.558, "loss/crossentropy": 2.0438401103019714, "loss/hidden": 2.9609375, "loss/jsd": 0.0, "loss/logits": 0.23186834901571274, "step": 7288 }, { "epoch": 0.455625, "grad_norm": 2.421875, "grad_norm_var": 0.0054972330729166664, "learning_rate": 0.0001, "loss": 7.4673, "loss/crossentropy": 2.392663598060608, "loss/hidden": 3.0625, "loss/jsd": 0.0, "loss/logits": 0.2379196509718895, "step": 7290 }, { "epoch": 0.45575, "grad_norm": 2.640625, "grad_norm_var": 0.0054972330729166664, "learning_rate": 0.0001, "loss": 7.5389, "loss/crossentropy": 2.108784556388855, "loss/hidden": 2.8828125, "loss/jsd": 0.0, "loss/logits": 0.23318488895893097, "step": 7292 }, { "epoch": 0.455875, "grad_norm": 2.609375, "grad_norm_var": 0.0055328369140625, "learning_rate": 0.0001, "loss": 7.448, "loss/crossentropy": 2.354803204536438, "loss/hidden": 2.9765625, "loss/jsd": 0.0, "loss/logits": 0.22564568370580673, "step": 7294 }, { "epoch": 0.456, "grad_norm": 2.515625, "grad_norm_var": 0.0060699462890625, "learning_rate": 0.0001, "loss": 7.5734, "loss/crossentropy": 2.5127440690994263, "loss/hidden": 2.9375, "loss/jsd": 0.0, "loss/logits": 0.23125988245010376, "step": 7296 }, { "epoch": 0.456125, "grad_norm": 2.3125, "grad_norm_var": 0.012874348958333334, "learning_rate": 0.0001, "loss": 7.2494, "loss/crossentropy": 1.9441785216331482, "loss/hidden": 2.8515625, "loss/jsd": 0.0, "loss/logits": 0.20165621489286423, "step": 7298 }, { "epoch": 0.45625, "grad_norm": 2.6875, "grad_norm_var": 0.01568603515625, "learning_rate": 0.0001, "loss": 7.431, "loss/crossentropy": 2.32335102558136, "loss/hidden": 2.9609375, "loss/jsd": 0.0, "loss/logits": 0.2277415618300438, "step": 7300 }, { "epoch": 0.456375, "grad_norm": 2.609375, "grad_norm_var": 0.01451416015625, "learning_rate": 0.0001, "loss": 7.512, "loss/crossentropy": 2.2587631940841675, "loss/hidden": 2.9453125, "loss/jsd": 0.0, "loss/logits": 0.22723326086997986, "step": 7302 }, { "epoch": 0.4565, "grad_norm": 2.546875, "grad_norm_var": 0.0140045166015625, "learning_rate": 0.0001, "loss": 7.4053, "loss/crossentropy": 2.337388277053833, "loss/hidden": 3.046875, "loss/jsd": 0.0, "loss/logits": 0.23596087843179703, "step": 7304 }, { "epoch": 0.456625, "grad_norm": 2.828125, "grad_norm_var": 0.017316691080729165, "learning_rate": 0.0001, "loss": 7.688, "loss/crossentropy": 2.3093864917755127, "loss/hidden": 2.9140625, "loss/jsd": 0.0, "loss/logits": 0.22000259160995483, "step": 7306 }, { "epoch": 0.45675, "grad_norm": 2.46875, "grad_norm_var": 0.01646728515625, "learning_rate": 0.0001, "loss": 7.359, "loss/crossentropy": 2.4014110565185547, "loss/hidden": 3.0, "loss/jsd": 0.0, "loss/logits": 0.2407515048980713, "step": 7308 }, { "epoch": 0.456875, "grad_norm": 2.40625, "grad_norm_var": 0.0156402587890625, "learning_rate": 0.0001, "loss": 7.4469, "loss/crossentropy": 2.334753394126892, "loss/hidden": 3.015625, "loss/jsd": 0.0, "loss/logits": 0.22986966371536255, "step": 7310 }, { "epoch": 0.457, "grad_norm": 2.46875, "grad_norm_var": 0.016331990559895832, "learning_rate": 0.0001, "loss": 7.2952, "loss/crossentropy": 2.2548893690109253, "loss/hidden": 2.9921875, "loss/jsd": 0.0, "loss/logits": 0.2305850014090538, "step": 7312 }, { "epoch": 0.457125, "grad_norm": 2.375, "grad_norm_var": 0.015067545572916667, "learning_rate": 0.0001, "loss": 7.3785, "loss/crossentropy": 2.103160858154297, "loss/hidden": 2.9609375, "loss/jsd": 0.0, "loss/logits": 0.2373146265745163, "step": 7314 }, { "epoch": 0.45725, "grad_norm": 2.578125, "grad_norm_var": 0.012726847330729167, "learning_rate": 0.0001, "loss": 7.4839, "loss/crossentropy": 2.2338685989379883, "loss/hidden": 2.9453125, "loss/jsd": 0.0, "loss/logits": 0.21401776373386383, "step": 7316 }, { "epoch": 0.457375, "grad_norm": 2.609375, "grad_norm_var": 0.0125640869140625, "learning_rate": 0.0001, "loss": 7.5687, "loss/crossentropy": 2.0535548329353333, "loss/hidden": 2.9375, "loss/jsd": 0.0, "loss/logits": 0.23582682758569717, "step": 7318 }, { "epoch": 0.4575, "grad_norm": 2.5625, "grad_norm_var": 0.01470947265625, "learning_rate": 0.0001, "loss": 7.5935, "loss/crossentropy": 2.321005702018738, "loss/hidden": 2.921875, "loss/jsd": 0.0, "loss/logits": 0.23046193271875381, "step": 7320 }, { "epoch": 0.457625, "grad_norm": 2.59375, "grad_norm_var": 0.01011962890625, "learning_rate": 0.0001, "loss": 7.3859, "loss/crossentropy": 2.354682207107544, "loss/hidden": 2.9453125, "loss/jsd": 0.0, "loss/logits": 0.22423436492681503, "step": 7322 }, { "epoch": 0.45775, "grad_norm": 2.46875, "grad_norm_var": 0.0100982666015625, "learning_rate": 0.0001, "loss": 7.4384, "loss/crossentropy": 2.0554239153862, "loss/hidden": 2.9609375, "loss/jsd": 0.0, "loss/logits": 0.2251441553235054, "step": 7324 }, { "epoch": 0.457875, "grad_norm": 2.484375, "grad_norm_var": 0.009235636393229166, "learning_rate": 0.0001, "loss": 7.5404, "loss/crossentropy": 2.2049041986465454, "loss/hidden": 2.8828125, "loss/jsd": 0.0, "loss/logits": 0.21071679145097733, "step": 7326 }, { "epoch": 0.458, "grad_norm": 2.421875, "grad_norm_var": 0.009235636393229166, "learning_rate": 0.0001, "loss": 7.5748, "loss/crossentropy": 2.4776655435562134, "loss/hidden": 2.9453125, "loss/jsd": 0.0, "loss/logits": 0.2443537786602974, "step": 7328 }, { "epoch": 0.458125, "grad_norm": 2.734375, "grad_norm_var": 0.009163411458333333, "learning_rate": 0.0001, "loss": 7.6177, "loss/crossentropy": 2.381058931350708, "loss/hidden": 2.8984375, "loss/jsd": 0.0, "loss/logits": 0.2310986965894699, "step": 7330 }, { "epoch": 0.45825, "grad_norm": 2.703125, "grad_norm_var": 0.00982666015625, "learning_rate": 0.0001, "loss": 7.5344, "loss/crossentropy": 2.4733017683029175, "loss/hidden": 2.9140625, "loss/jsd": 0.0, "loss/logits": 0.20668289065361023, "step": 7332 }, { "epoch": 0.458375, "grad_norm": 2.625, "grad_norm_var": 0.010237630208333333, "learning_rate": 0.0001, "loss": 7.4604, "loss/crossentropy": 2.397672414779663, "loss/hidden": 2.9296875, "loss/jsd": 0.0, "loss/logits": 0.2173973247408867, "step": 7334 }, { "epoch": 0.4585, "grad_norm": 2.5, "grad_norm_var": 0.008675130208333333, "learning_rate": 0.0001, "loss": 7.5166, "loss/crossentropy": 2.092658042907715, "loss/hidden": 2.9140625, "loss/jsd": 0.0, "loss/logits": 0.23045500367879868, "step": 7336 }, { "epoch": 0.458625, "grad_norm": 2.5625, "grad_norm_var": 0.008250935872395834, "learning_rate": 0.0001, "loss": 7.6074, "loss/crossentropy": 2.477524757385254, "loss/hidden": 2.9765625, "loss/jsd": 0.0, "loss/logits": 0.2453545778989792, "step": 7338 }, { "epoch": 0.45875, "grad_norm": 2.515625, "grad_norm_var": 0.0077707926432291664, "learning_rate": 0.0001, "loss": 7.3387, "loss/crossentropy": 2.2862935066223145, "loss/hidden": 2.96875, "loss/jsd": 0.0, "loss/logits": 0.23493624478578568, "step": 7340 }, { "epoch": 0.458875, "grad_norm": 2.640625, "grad_norm_var": 0.008552042643229167, "learning_rate": 0.0001, "loss": 7.6519, "loss/crossentropy": 2.32111394405365, "loss/hidden": 3.0390625, "loss/jsd": 0.0, "loss/logits": 0.24085529148578644, "step": 7342 }, { "epoch": 0.459, "grad_norm": 2.53125, "grad_norm_var": 0.007666015625, "learning_rate": 0.0001, "loss": 7.2703, "loss/crossentropy": 2.290968418121338, "loss/hidden": 2.953125, "loss/jsd": 0.0, "loss/logits": 0.22057055681943893, "step": 7344 }, { "epoch": 0.459125, "grad_norm": 2.578125, "grad_norm_var": 0.0058095296223958336, "learning_rate": 0.0001, "loss": 7.4053, "loss/crossentropy": 2.1614257097244263, "loss/hidden": 2.9921875, "loss/jsd": 0.0, "loss/logits": 0.2244672030210495, "step": 7346 }, { "epoch": 0.45925, "grad_norm": 2.484375, "grad_norm_var": 0.0055816650390625, "learning_rate": 0.0001, "loss": 7.4303, "loss/crossentropy": 2.3572819232940674, "loss/hidden": 2.9296875, "loss/jsd": 0.0, "loss/logits": 0.23413042724132538, "step": 7348 }, { "epoch": 0.459375, "grad_norm": 2.375, "grad_norm_var": 0.007372029622395833, "learning_rate": 0.0001, "loss": 7.5297, "loss/crossentropy": 2.119737684726715, "loss/hidden": 3.0625, "loss/jsd": 0.0, "loss/logits": 0.23356341570615768, "step": 7350 }, { "epoch": 0.4595, "grad_norm": 2.546875, "grad_norm_var": 0.018236287434895835, "learning_rate": 0.0001, "loss": 7.5274, "loss/crossentropy": 2.4658172130584717, "loss/hidden": 2.9375, "loss/jsd": 0.0, "loss/logits": 0.22511006891727448, "step": 7352 }, { "epoch": 0.459625, "grad_norm": 2.515625, "grad_norm_var": 0.018333943684895833, "learning_rate": 0.0001, "loss": 7.412, "loss/crossentropy": 2.3136802911758423, "loss/hidden": 2.96875, "loss/jsd": 0.0, "loss/logits": 0.22880984097719193, "step": 7354 }, { "epoch": 0.45975, "grad_norm": 2.640625, "grad_norm_var": 0.01767578125, "learning_rate": 0.0001, "loss": 7.4919, "loss/crossentropy": 2.2620712518692017, "loss/hidden": 3.0078125, "loss/jsd": 0.0, "loss/logits": 0.21725363284349442, "step": 7356 }, { "epoch": 0.459875, "grad_norm": 2.71875, "grad_norm_var": 0.019807942708333335, "learning_rate": 0.0001, "loss": 7.5364, "loss/crossentropy": 2.3213605880737305, "loss/hidden": 3.0234375, "loss/jsd": 0.0, "loss/logits": 0.22434928268194199, "step": 7358 }, { "epoch": 0.46, "grad_norm": 2.53125, "grad_norm_var": 0.019950358072916667, "learning_rate": 0.0001, "loss": 7.3695, "loss/crossentropy": 2.2258609533309937, "loss/hidden": 2.9140625, "loss/jsd": 0.0, "loss/logits": 0.22187072038650513, "step": 7360 }, { "epoch": 0.460125, "grad_norm": 2.625, "grad_norm_var": 0.022021484375, "learning_rate": 0.0001, "loss": 7.5633, "loss/crossentropy": 2.007631301879883, "loss/hidden": 2.859375, "loss/jsd": 0.0, "loss/logits": 0.2181134819984436, "step": 7362 }, { "epoch": 0.46025, "grad_norm": 2.609375, "grad_norm_var": 0.019847615559895834, "learning_rate": 0.0001, "loss": 7.5982, "loss/crossentropy": 2.4932351112365723, "loss/hidden": 3.0234375, "loss/jsd": 0.0, "loss/logits": 0.2308068796992302, "step": 7364 }, { "epoch": 0.460375, "grad_norm": 2.5625, "grad_norm_var": 0.016695149739583335, "learning_rate": 0.0001, "loss": 7.4733, "loss/crossentropy": 2.4338245391845703, "loss/hidden": 2.9921875, "loss/jsd": 0.0, "loss/logits": 0.22848600149154663, "step": 7366 }, { "epoch": 0.4605, "grad_norm": 2.6875, "grad_norm_var": 0.008675130208333333, "learning_rate": 0.0001, "loss": 7.5352, "loss/crossentropy": 2.1546473503112793, "loss/hidden": 2.8984375, "loss/jsd": 0.0, "loss/logits": 0.21617963910102844, "step": 7368 }, { "epoch": 0.460625, "grad_norm": 2.640625, "grad_norm_var": 0.008626302083333334, "learning_rate": 0.0001, "loss": 7.639, "loss/crossentropy": 2.379016637802124, "loss/hidden": 2.984375, "loss/jsd": 0.0, "loss/logits": 0.24141156673431396, "step": 7370 }, { "epoch": 0.46075, "grad_norm": 2.75, "grad_norm_var": 0.0111328125, "learning_rate": 0.0001, "loss": 7.6232, "loss/crossentropy": 2.54293692111969, "loss/hidden": 2.9375, "loss/jsd": 0.0, "loss/logits": 0.24175000190734863, "step": 7372 }, { "epoch": 0.460875, "grad_norm": 3.328125, "grad_norm_var": 0.04127197265625, "learning_rate": 0.0001, "loss": 7.4297, "loss/crossentropy": 2.212146759033203, "loss/hidden": 2.984375, "loss/jsd": 0.0, "loss/logits": 0.22513853013515472, "step": 7374 }, { "epoch": 0.461, "grad_norm": 2.90625, "grad_norm_var": 0.04377339680989583, "learning_rate": 0.0001, "loss": 7.6515, "loss/crossentropy": 2.2985663414001465, "loss/hidden": 2.90625, "loss/jsd": 0.0, "loss/logits": 0.23076993972063065, "step": 7376 }, { "epoch": 0.461125, "grad_norm": 2.578125, "grad_norm_var": 0.044270833333333336, "learning_rate": 0.0001, "loss": 7.5393, "loss/crossentropy": 2.085919678211212, "loss/hidden": 2.9296875, "loss/jsd": 0.0, "loss/logits": 0.22570392489433289, "step": 7378 }, { "epoch": 0.46125, "grad_norm": 2.59375, "grad_norm_var": 0.04566650390625, "learning_rate": 0.0001, "loss": 7.8112, "loss/crossentropy": 2.50254225730896, "loss/hidden": 3.140625, "loss/jsd": 0.0, "loss/logits": 0.24986283481121063, "step": 7380 }, { "epoch": 0.461375, "grad_norm": 2.578125, "grad_norm_var": 0.04973551432291667, "learning_rate": 0.0001, "loss": 7.3371, "loss/crossentropy": 2.304502844810486, "loss/hidden": 2.96875, "loss/jsd": 0.0, "loss/logits": 0.22612842917442322, "step": 7382 }, { "epoch": 0.4615, "grad_norm": 2.65625, "grad_norm_var": 0.04895833333333333, "learning_rate": 0.0001, "loss": 7.5115, "loss/crossentropy": 2.145179510116577, "loss/hidden": 2.9296875, "loss/jsd": 0.0, "loss/logits": 0.23642145842313766, "step": 7384 }, { "epoch": 0.461625, "grad_norm": 2.515625, "grad_norm_var": 0.0514068603515625, "learning_rate": 0.0001, "loss": 7.4777, "loss/crossentropy": 1.9638743996620178, "loss/hidden": 2.9921875, "loss/jsd": 0.0, "loss/logits": 0.21321027725934982, "step": 7386 }, { "epoch": 0.46175, "grad_norm": 2.46875, "grad_norm_var": 0.0509185791015625, "learning_rate": 0.0001, "loss": 7.4851, "loss/crossentropy": 2.0539122819900513, "loss/hidden": 3.078125, "loss/jsd": 0.0, "loss/logits": 0.22125357389450073, "step": 7388 }, { "epoch": 0.461875, "grad_norm": 2.59375, "grad_norm_var": 0.016633097330729166, "learning_rate": 0.0001, "loss": 7.6363, "loss/crossentropy": 2.463230848312378, "loss/hidden": 2.921875, "loss/jsd": 0.0, "loss/logits": 0.22182685136795044, "step": 7390 }, { "epoch": 0.462, "grad_norm": 2.640625, "grad_norm_var": 0.009163411458333333, "learning_rate": 0.0001, "loss": 7.5917, "loss/crossentropy": 2.3424625396728516, "loss/hidden": 2.9296875, "loss/jsd": 0.0, "loss/logits": 0.2190731316804886, "step": 7392 }, { "epoch": 0.462125, "grad_norm": 2.640625, "grad_norm_var": 0.011262003580729167, "learning_rate": 0.0001, "loss": 7.3018, "loss/crossentropy": 2.0450503826141357, "loss/hidden": 2.953125, "loss/jsd": 0.0, "loss/logits": 0.2240830734372139, "step": 7394 }, { "epoch": 0.46225, "grad_norm": 2.5, "grad_norm_var": 0.0065826416015625, "learning_rate": 0.0001, "loss": 7.3525, "loss/crossentropy": 2.0469712018966675, "loss/hidden": 2.84375, "loss/jsd": 0.0, "loss/logits": 0.21751779317855835, "step": 7396 }, { "epoch": 0.462375, "grad_norm": 2.890625, "grad_norm_var": 0.014216105143229166, "learning_rate": 0.0001, "loss": 7.6664, "loss/crossentropy": 2.1431461572647095, "loss/hidden": 2.9375, "loss/jsd": 0.0, "loss/logits": 0.22404006868600845, "step": 7398 }, { "epoch": 0.4625, "grad_norm": 2.4375, "grad_norm_var": 0.015327962239583333, "learning_rate": 0.0001, "loss": 7.4696, "loss/crossentropy": 2.4352341890335083, "loss/hidden": 2.9140625, "loss/jsd": 0.0, "loss/logits": 0.20735271275043488, "step": 7400 }, { "epoch": 0.462625, "grad_norm": 2.5625, "grad_norm_var": 0.0153472900390625, "learning_rate": 0.0001, "loss": 7.7085, "loss/crossentropy": 2.2785404920578003, "loss/hidden": 2.9140625, "loss/jsd": 0.0, "loss/logits": 0.22813603281974792, "step": 7402 }, { "epoch": 0.46275, "grad_norm": 2.515625, "grad_norm_var": 0.015754191080729167, "learning_rate": 0.0001, "loss": 7.4219, "loss/crossentropy": 2.2228078842163086, "loss/hidden": 2.90625, "loss/jsd": 0.0, "loss/logits": 0.2266954481601715, "step": 7404 }, { "epoch": 0.462875, "grad_norm": 2.609375, "grad_norm_var": 0.0156646728515625, "learning_rate": 0.0001, "loss": 7.5373, "loss/crossentropy": 2.2263147830963135, "loss/hidden": 3.0703125, "loss/jsd": 0.0, "loss/logits": 0.25031523406505585, "step": 7406 }, { "epoch": 0.463, "grad_norm": 2.515625, "grad_norm_var": 0.015718587239583335, "learning_rate": 0.0001, "loss": 7.4341, "loss/crossentropy": 2.1717947721481323, "loss/hidden": 2.90625, "loss/jsd": 0.0, "loss/logits": 0.22138065099716187, "step": 7408 }, { "epoch": 0.463125, "grad_norm": 2.65625, "grad_norm_var": 0.013736979166666666, "learning_rate": 0.0001, "loss": 7.369, "loss/crossentropy": 2.028733789920807, "loss/hidden": 2.8984375, "loss/jsd": 0.0, "loss/logits": 0.22024127840995789, "step": 7410 }, { "epoch": 0.46325, "grad_norm": 2.484375, "grad_norm_var": 0.015998331705729167, "learning_rate": 0.0001, "loss": 7.3025, "loss/crossentropy": 2.1332881450653076, "loss/hidden": 2.890625, "loss/jsd": 0.0, "loss/logits": 0.21039804816246033, "step": 7412 }, { "epoch": 0.463375, "grad_norm": 2.609375, "grad_norm_var": 0.008307902018229167, "learning_rate": 0.0001, "loss": 7.5216, "loss/crossentropy": 2.274906635284424, "loss/hidden": 2.8828125, "loss/jsd": 0.0, "loss/logits": 0.22897058725357056, "step": 7414 }, { "epoch": 0.4635, "grad_norm": 2.453125, "grad_norm_var": 0.0080718994140625, "learning_rate": 0.0001, "loss": 7.3344, "loss/crossentropy": 2.4648267030715942, "loss/hidden": 2.9609375, "loss/jsd": 0.0, "loss/logits": 0.224023275077343, "step": 7416 }, { "epoch": 0.463625, "grad_norm": 2.609375, "grad_norm_var": 0.007926432291666667, "learning_rate": 0.0001, "loss": 7.5636, "loss/crossentropy": 2.4589834213256836, "loss/hidden": 3.0, "loss/jsd": 0.0, "loss/logits": 0.23085740953683853, "step": 7418 }, { "epoch": 0.46375, "grad_norm": 2.375, "grad_norm_var": 0.009391276041666667, "learning_rate": 0.0001, "loss": 7.1444, "loss/crossentropy": 2.172826886177063, "loss/hidden": 2.890625, "loss/jsd": 0.0, "loss/logits": 0.22426262497901917, "step": 7420 }, { "epoch": 0.463875, "grad_norm": 2.5625, "grad_norm_var": 0.01002197265625, "learning_rate": 0.0001, "loss": 7.572, "loss/crossentropy": 2.294726014137268, "loss/hidden": 2.9296875, "loss/jsd": 0.0, "loss/logits": 0.21942047029733658, "step": 7422 }, { "epoch": 0.464, "grad_norm": 2.484375, "grad_norm_var": 0.012532552083333334, "learning_rate": 0.0001, "loss": 7.3559, "loss/crossentropy": 2.0881186723709106, "loss/hidden": 2.8515625, "loss/jsd": 0.0, "loss/logits": 0.2111830785870552, "step": 7424 }, { "epoch": 0.464125, "grad_norm": 2.71875, "grad_norm_var": 0.01383056640625, "learning_rate": 0.0001, "loss": 7.6726, "loss/crossentropy": 2.3748130798339844, "loss/hidden": 2.9609375, "loss/jsd": 0.0, "loss/logits": 0.23443245887756348, "step": 7426 }, { "epoch": 0.46425, "grad_norm": 2.546875, "grad_norm_var": 0.009430948893229167, "learning_rate": 0.0001, "loss": 7.5225, "loss/crossentropy": 2.2903072834014893, "loss/hidden": 2.890625, "loss/jsd": 0.0, "loss/logits": 0.2022785320878029, "step": 7428 }, { "epoch": 0.464375, "grad_norm": 2.578125, "grad_norm_var": 0.01181640625, "learning_rate": 0.0001, "loss": 7.4494, "loss/crossentropy": 2.1331697702407837, "loss/hidden": 2.859375, "loss/jsd": 0.0, "loss/logits": 0.21722245961427689, "step": 7430 }, { "epoch": 0.4645, "grad_norm": 2.5625, "grad_norm_var": 0.0125, "learning_rate": 0.0001, "loss": 7.4794, "loss/crossentropy": 2.4622026681900024, "loss/hidden": 2.890625, "loss/jsd": 0.0, "loss/logits": 0.2228771075606346, "step": 7432 }, { "epoch": 0.464625, "grad_norm": 2.75, "grad_norm_var": 0.015404256184895833, "learning_rate": 0.0001, "loss": 7.653, "loss/crossentropy": 2.479435443878174, "loss/hidden": 3.046875, "loss/jsd": 0.0, "loss/logits": 0.2429051250219345, "step": 7434 }, { "epoch": 0.46475, "grad_norm": 3.484375, "grad_norm_var": 0.06877848307291666, "learning_rate": 0.0001, "loss": 7.5261, "loss/crossentropy": 2.310993194580078, "loss/hidden": 2.8671875, "loss/jsd": 0.0, "loss/logits": 0.2262483835220337, "step": 7436 }, { "epoch": 0.464875, "grad_norm": 2.578125, "grad_norm_var": 0.06847330729166666, "learning_rate": 0.0001, "loss": 7.6452, "loss/crossentropy": 2.315868616104126, "loss/hidden": 2.984375, "loss/jsd": 0.0, "loss/logits": 0.2104288637638092, "step": 7438 }, { "epoch": 0.465, "grad_norm": 2.609375, "grad_norm_var": 0.06376851399739583, "learning_rate": 0.0001, "loss": 7.5483, "loss/crossentropy": 2.3393532037734985, "loss/hidden": 2.921875, "loss/jsd": 0.0, "loss/logits": 0.22894802689552307, "step": 7440 }, { "epoch": 0.465125, "grad_norm": 2.71875, "grad_norm_var": 0.06433003743489583, "learning_rate": 0.0001, "loss": 7.3906, "loss/crossentropy": 2.1513169407844543, "loss/hidden": 2.875, "loss/jsd": 0.0, "loss/logits": 0.19916437566280365, "step": 7442 }, { "epoch": 0.46525, "grad_norm": 2.625, "grad_norm_var": 0.15073954264322917, "learning_rate": 0.0001, "loss": 7.6772, "loss/crossentropy": 2.3496642112731934, "loss/hidden": 2.9765625, "loss/jsd": 0.0, "loss/logits": 0.24646421521902084, "step": 7444 }, { "epoch": 0.465375, "grad_norm": 2.46875, "grad_norm_var": 0.14439188639322917, "learning_rate": 0.0001, "loss": 7.3125, "loss/crossentropy": 2.1705400943756104, "loss/hidden": 2.9921875, "loss/jsd": 0.0, "loss/logits": 0.22589680552482605, "step": 7446 }, { "epoch": 0.4655, "grad_norm": 2.546875, "grad_norm_var": 0.14058837890625, "learning_rate": 0.0001, "loss": 7.3425, "loss/crossentropy": 2.3591222763061523, "loss/hidden": 2.9609375, "loss/jsd": 0.0, "loss/logits": 0.21697549521923065, "step": 7448 }, { "epoch": 0.465625, "grad_norm": 2.375, "grad_norm_var": 0.1466949462890625, "learning_rate": 0.0001, "loss": 7.32, "loss/crossentropy": 2.175205707550049, "loss/hidden": 2.96875, "loss/jsd": 0.0, "loss/logits": 0.22044803947210312, "step": 7450 }, { "epoch": 0.46575, "grad_norm": 2.46875, "grad_norm_var": 0.10515848795572917, "learning_rate": 0.0001, "loss": 7.4624, "loss/crossentropy": 2.2731419801712036, "loss/hidden": 2.90625, "loss/jsd": 0.0, "loss/logits": 0.2193523496389389, "step": 7452 }, { "epoch": 0.465875, "grad_norm": 2.703125, "grad_norm_var": 0.1047760009765625, "learning_rate": 0.0001, "loss": 7.6741, "loss/crossentropy": 2.3229037523269653, "loss/hidden": 2.9921875, "loss/jsd": 0.0, "loss/logits": 0.2360212579369545, "step": 7454 }, { "epoch": 0.466, "grad_norm": 2.796875, "grad_norm_var": 0.10621337890625, "learning_rate": 0.0001, "loss": 7.4393, "loss/crossentropy": 2.2018871307373047, "loss/hidden": 2.96875, "loss/jsd": 0.0, "loss/logits": 0.2167951464653015, "step": 7456 }, { "epoch": 0.466125, "grad_norm": 2.484375, "grad_norm_var": 0.10816650390625, "learning_rate": 0.0001, "loss": 7.4345, "loss/crossentropy": 2.2210224866867065, "loss/hidden": 2.8828125, "loss/jsd": 0.0, "loss/logits": 0.22730884701013565, "step": 7458 }, { "epoch": 0.46625, "grad_norm": 2.515625, "grad_norm_var": 0.014134724934895834, "learning_rate": 0.0001, "loss": 7.4869, "loss/crossentropy": 2.150461435317993, "loss/hidden": 3.0390625, "loss/jsd": 0.0, "loss/logits": 0.22167368233203888, "step": 7460 }, { "epoch": 0.466375, "grad_norm": 2.765625, "grad_norm_var": 0.013362630208333334, "learning_rate": 0.0001, "loss": 7.492, "loss/crossentropy": 2.3534340858459473, "loss/hidden": 3.0078125, "loss/jsd": 0.0, "loss/logits": 0.2165219485759735, "step": 7462 }, { "epoch": 0.4665, "grad_norm": 2.625, "grad_norm_var": 0.01402587890625, "learning_rate": 0.0001, "loss": 7.5818, "loss/crossentropy": 2.2799781560897827, "loss/hidden": 3.015625, "loss/jsd": 0.0, "loss/logits": 0.22724565118551254, "step": 7464 }, { "epoch": 0.466625, "grad_norm": 2.5625, "grad_norm_var": 0.011751302083333333, "learning_rate": 0.0001, "loss": 7.4955, "loss/crossentropy": 2.2778738737106323, "loss/hidden": 3.0546875, "loss/jsd": 0.0, "loss/logits": 0.23060085624456406, "step": 7466 }, { "epoch": 0.46675, "grad_norm": 2.640625, "grad_norm_var": 0.010888671875, "learning_rate": 0.0001, "loss": 7.4369, "loss/crossentropy": 2.3281946182250977, "loss/hidden": 3.03125, "loss/jsd": 0.0, "loss/logits": 0.2244219332933426, "step": 7468 }, { "epoch": 0.466875, "grad_norm": 2.59375, "grad_norm_var": 0.010090128580729166, "learning_rate": 0.0001, "loss": 7.6298, "loss/crossentropy": 2.0607125759124756, "loss/hidden": 2.9375, "loss/jsd": 0.0, "loss/logits": 0.21379946917295456, "step": 7470 }, { "epoch": 0.467, "grad_norm": 2.5625, "grad_norm_var": 0.006734212239583333, "learning_rate": 0.0001, "loss": 7.5543, "loss/crossentropy": 2.0742074847221375, "loss/hidden": 2.9453125, "loss/jsd": 0.0, "loss/logits": 0.24517051875591278, "step": 7472 }, { "epoch": 0.467125, "grad_norm": 2.65625, "grad_norm_var": 0.006078084309895833, "learning_rate": 0.0001, "loss": 7.4869, "loss/crossentropy": 2.1688080430030823, "loss/hidden": 2.84375, "loss/jsd": 0.0, "loss/logits": 0.22626544535160065, "step": 7474 }, { "epoch": 0.46725, "grad_norm": 2.65625, "grad_norm_var": 0.006722005208333334, "learning_rate": 0.0001, "loss": 7.2733, "loss/crossentropy": 1.9243706464767456, "loss/hidden": 2.90625, "loss/jsd": 0.0, "loss/logits": 0.20830260962247849, "step": 7476 }, { "epoch": 0.467375, "grad_norm": 2.515625, "grad_norm_var": 0.006201171875, "learning_rate": 0.0001, "loss": 7.7622, "loss/crossentropy": 2.130434989929199, "loss/hidden": 2.8984375, "loss/jsd": 0.0, "loss/logits": 0.24139249324798584, "step": 7478 }, { "epoch": 0.4675, "grad_norm": 2.671875, "grad_norm_var": 0.0072418212890625, "learning_rate": 0.0001, "loss": 7.5473, "loss/crossentropy": 2.068904399871826, "loss/hidden": 2.8515625, "loss/jsd": 0.0, "loss/logits": 0.21402664482593536, "step": 7480 }, { "epoch": 0.467625, "grad_norm": 2.640625, "grad_norm_var": 0.006966145833333334, "learning_rate": 0.0001, "loss": 7.4063, "loss/crossentropy": 2.2135156989097595, "loss/hidden": 2.890625, "loss/jsd": 0.0, "loss/logits": 0.21822866797447205, "step": 7482 }, { "epoch": 0.46775, "grad_norm": 2.75, "grad_norm_var": 0.0164459228515625, "learning_rate": 0.0001, "loss": 7.683, "loss/crossentropy": 2.5468629598617554, "loss/hidden": 2.9375, "loss/jsd": 0.0, "loss/logits": 0.2268260270357132, "step": 7484 }, { "epoch": 0.467875, "grad_norm": 2.578125, "grad_norm_var": 0.01656494140625, "learning_rate": 0.0001, "loss": 7.5816, "loss/crossentropy": 2.1518990993499756, "loss/hidden": 3.0390625, "loss/jsd": 0.0, "loss/logits": 0.2527836561203003, "step": 7486 }, { "epoch": 0.468, "grad_norm": 2.59375, "grad_norm_var": 0.0162506103515625, "learning_rate": 0.0001, "loss": 7.5217, "loss/crossentropy": 2.2250465154647827, "loss/hidden": 3.046875, "loss/jsd": 0.0, "loss/logits": 0.22572959959506989, "step": 7488 }, { "epoch": 0.468125, "grad_norm": 2.46875, "grad_norm_var": 0.018147786458333332, "learning_rate": 0.0001, "loss": 7.4076, "loss/crossentropy": 2.245205879211426, "loss/hidden": 3.0234375, "loss/jsd": 0.0, "loss/logits": 0.2194642722606659, "step": 7490 }, { "epoch": 0.46825, "grad_norm": 2.46875, "grad_norm_var": 0.018229166666666668, "learning_rate": 0.0001, "loss": 7.3504, "loss/crossentropy": 2.1961398124694824, "loss/hidden": 2.984375, "loss/jsd": 0.0, "loss/logits": 0.22188910841941833, "step": 7492 }, { "epoch": 0.468375, "grad_norm": 2.71875, "grad_norm_var": 0.018480428059895835, "learning_rate": 0.0001, "loss": 7.4658, "loss/crossentropy": 2.1420774459838867, "loss/hidden": 2.9921875, "loss/jsd": 0.0, "loss/logits": 0.2425178736448288, "step": 7494 }, { "epoch": 0.4685, "grad_norm": 2.578125, "grad_norm_var": 0.017073567708333334, "learning_rate": 0.0001, "loss": 7.6836, "loss/crossentropy": 2.347414970397949, "loss/hidden": 3.0625, "loss/jsd": 0.0, "loss/logits": 0.21874912083148956, "step": 7496 }, { "epoch": 0.468625, "grad_norm": 2.5625, "grad_norm_var": 0.0167144775390625, "learning_rate": 0.0001, "loss": 7.3137, "loss/crossentropy": 2.179791808128357, "loss/hidden": 2.875, "loss/jsd": 0.0, "loss/logits": 0.21431612968444824, "step": 7498 }, { "epoch": 0.46875, "grad_norm": 2.5625, "grad_norm_var": 0.005757649739583333, "learning_rate": 0.0001, "loss": 7.7105, "loss/crossentropy": 2.3909919261932373, "loss/hidden": 3.015625, "loss/jsd": 0.0, "loss/logits": 0.24787569046020508, "step": 7500 }, { "epoch": 0.468875, "grad_norm": 2.578125, "grad_norm_var": 0.007515462239583334, "learning_rate": 0.0001, "loss": 7.2335, "loss/crossentropy": 2.2461036443710327, "loss/hidden": 3.125, "loss/jsd": 0.0, "loss/logits": 0.21556192636489868, "step": 7502 }, { "epoch": 0.469, "grad_norm": 2.484375, "grad_norm_var": 0.008446248372395833, "learning_rate": 0.0001, "loss": 7.6579, "loss/crossentropy": 2.5435656309127808, "loss/hidden": 3.046875, "loss/jsd": 0.0, "loss/logits": 0.23903077840805054, "step": 7504 }, { "epoch": 0.469125, "grad_norm": 2.53125, "grad_norm_var": 0.0088043212890625, "learning_rate": 0.0001, "loss": 7.4871, "loss/crossentropy": 2.3575626611709595, "loss/hidden": 3.046875, "loss/jsd": 0.0, "loss/logits": 0.22954238206148148, "step": 7506 }, { "epoch": 0.46925, "grad_norm": 2.671875, "grad_norm_var": 0.008382161458333334, "learning_rate": 0.0001, "loss": 7.5142, "loss/crossentropy": 2.5515412092208862, "loss/hidden": 3.0078125, "loss/jsd": 0.0, "loss/logits": 0.23818785697221756, "step": 7508 }, { "epoch": 0.469375, "grad_norm": 2.53125, "grad_norm_var": 0.006298828125, "learning_rate": 0.0001, "loss": 7.3778, "loss/crossentropy": 2.3858693838119507, "loss/hidden": 3.109375, "loss/jsd": 0.0, "loss/logits": 0.23035944998264313, "step": 7510 }, { "epoch": 0.4695, "grad_norm": 2.5625, "grad_norm_var": 0.009566243489583333, "learning_rate": 0.0001, "loss": 7.1863, "loss/crossentropy": 2.304104208946228, "loss/hidden": 2.9453125, "loss/jsd": 0.0, "loss/logits": 0.21947531402111053, "step": 7512 }, { "epoch": 0.469625, "grad_norm": 2.546875, "grad_norm_var": 0.009723917643229166, "learning_rate": 0.0001, "loss": 7.3938, "loss/crossentropy": 2.3828344345092773, "loss/hidden": 2.921875, "loss/jsd": 0.0, "loss/logits": 0.22297868132591248, "step": 7514 }, { "epoch": 0.46975, "grad_norm": 2.484375, "grad_norm_var": 0.009772745768229167, "learning_rate": 0.0001, "loss": 7.5845, "loss/crossentropy": 2.1664215326309204, "loss/hidden": 2.90625, "loss/jsd": 0.0, "loss/logits": 0.21769656985998154, "step": 7516 }, { "epoch": 0.469875, "grad_norm": 2.578125, "grad_norm_var": 0.008796183268229167, "learning_rate": 0.0001, "loss": 7.3356, "loss/crossentropy": 2.3967437744140625, "loss/hidden": 2.875, "loss/jsd": 0.0, "loss/logits": 0.21602225303649902, "step": 7518 }, { "epoch": 0.47, "grad_norm": 2.59375, "grad_norm_var": 0.007942708333333333, "learning_rate": 0.0001, "loss": 7.5093, "loss/crossentropy": 2.4606828689575195, "loss/hidden": 2.9453125, "loss/jsd": 0.0, "loss/logits": 0.22484609484672546, "step": 7520 }, { "epoch": 0.470125, "grad_norm": 2.90625, "grad_norm_var": 0.015234375, "learning_rate": 0.0001, "loss": 7.7304, "loss/crossentropy": 2.496990203857422, "loss/hidden": 2.984375, "loss/jsd": 0.0, "loss/logits": 0.2282068431377411, "step": 7522 }, { "epoch": 0.47025, "grad_norm": 2.5, "grad_norm_var": 0.014351399739583333, "learning_rate": 0.0001, "loss": 7.5532, "loss/crossentropy": 2.366703510284424, "loss/hidden": 2.9140625, "loss/jsd": 0.0, "loss/logits": 0.21963699162006378, "step": 7524 }, { "epoch": 0.470375, "grad_norm": 2.515625, "grad_norm_var": 0.014567057291666666, "learning_rate": 0.0001, "loss": 7.1565, "loss/crossentropy": 1.8731043934822083, "loss/hidden": 2.875, "loss/jsd": 0.0, "loss/logits": 0.21080220490694046, "step": 7526 }, { "epoch": 0.4705, "grad_norm": 2.328125, "grad_norm_var": 0.014518229166666667, "learning_rate": 0.0001, "loss": 7.1057, "loss/crossentropy": 2.1666380167007446, "loss/hidden": 2.90625, "loss/jsd": 0.0, "loss/logits": 0.21890105307102203, "step": 7528 }, { "epoch": 0.470625, "grad_norm": 2.625, "grad_norm_var": 0.015754191080729167, "learning_rate": 0.0001, "loss": 7.496, "loss/crossentropy": 2.274445652961731, "loss/hidden": 2.921875, "loss/jsd": 0.0, "loss/logits": 0.22877724468708038, "step": 7530 }, { "epoch": 0.47075, "grad_norm": 2.609375, "grad_norm_var": 0.015458170572916667, "learning_rate": 0.0001, "loss": 7.6616, "loss/crossentropy": 2.6539989709854126, "loss/hidden": 3.03125, "loss/jsd": 0.0, "loss/logits": 0.24405335634946823, "step": 7532 }, { "epoch": 0.470875, "grad_norm": 2.5, "grad_norm_var": 0.015625, "learning_rate": 0.0001, "loss": 7.8128, "loss/crossentropy": 2.7639050483703613, "loss/hidden": 3.109375, "loss/jsd": 0.0, "loss/logits": 0.24323812872171402, "step": 7534 }, { "epoch": 0.471, "grad_norm": 2.484375, "grad_norm_var": 0.015641276041666666, "learning_rate": 0.0001, "loss": 7.3938, "loss/crossentropy": 2.049751400947571, "loss/hidden": 2.890625, "loss/jsd": 0.0, "loss/logits": 0.21793736517429352, "step": 7536 }, { "epoch": 0.471125, "grad_norm": 2.515625, "grad_norm_var": 0.00650634765625, "learning_rate": 0.0001, "loss": 7.6296, "loss/crossentropy": 2.3478161096572876, "loss/hidden": 3.0234375, "loss/jsd": 0.0, "loss/logits": 0.22824416309595108, "step": 7538 }, { "epoch": 0.47125, "grad_norm": 2.53125, "grad_norm_var": 0.0066070556640625, "learning_rate": 0.0001, "loss": 7.4823, "loss/crossentropy": 2.3621110916137695, "loss/hidden": 2.890625, "loss/jsd": 0.0, "loss/logits": 0.20483282953500748, "step": 7540 }, { "epoch": 0.471375, "grad_norm": 2.71875, "grad_norm_var": 0.008112589518229166, "learning_rate": 0.0001, "loss": 7.3978, "loss/crossentropy": 1.9511004090309143, "loss/hidden": 2.9921875, "loss/jsd": 0.0, "loss/logits": 0.22462377697229385, "step": 7542 }, { "epoch": 0.4715, "grad_norm": 2.484375, "grad_norm_var": 0.005106608072916667, "learning_rate": 0.0001, "loss": 7.3796, "loss/crossentropy": 2.2452269792556763, "loss/hidden": 2.90625, "loss/jsd": 0.0, "loss/logits": 0.2183745950460434, "step": 7544 }, { "epoch": 0.471625, "grad_norm": 2.546875, "grad_norm_var": 0.004313151041666667, "learning_rate": 0.0001, "loss": 7.3915, "loss/crossentropy": 2.2871146202087402, "loss/hidden": 2.921875, "loss/jsd": 0.0, "loss/logits": 0.20815083384513855, "step": 7546 }, { "epoch": 0.47175, "grad_norm": 2.421875, "grad_norm_var": 0.0051422119140625, "learning_rate": 0.0001, "loss": 7.2837, "loss/crossentropy": 2.0210039019584656, "loss/hidden": 2.859375, "loss/jsd": 0.0, "loss/logits": 0.2112835943698883, "step": 7548 }, { "epoch": 0.471875, "grad_norm": 2.578125, "grad_norm_var": 0.005475870768229167, "learning_rate": 0.0001, "loss": 7.44, "loss/crossentropy": 2.2161262035369873, "loss/hidden": 2.890625, "loss/jsd": 0.0, "loss/logits": 0.19441009312868118, "step": 7550 }, { "epoch": 0.472, "grad_norm": 2.578125, "grad_norm_var": 0.005866495768229166, "learning_rate": 0.0001, "loss": 7.257, "loss/crossentropy": 2.1528568267822266, "loss/hidden": 2.9453125, "loss/jsd": 0.0, "loss/logits": 0.20396533608436584, "step": 7552 }, { "epoch": 0.472125, "grad_norm": 2.515625, "grad_norm_var": 0.005790201822916666, "learning_rate": 0.0001, "loss": 7.5612, "loss/crossentropy": 2.479175329208374, "loss/hidden": 3.0, "loss/jsd": 0.0, "loss/logits": 0.23107275366783142, "step": 7554 }, { "epoch": 0.47225, "grad_norm": 2.5625, "grad_norm_var": 0.005826822916666667, "learning_rate": 0.0001, "loss": 7.4033, "loss/crossentropy": 2.127906084060669, "loss/hidden": 2.9765625, "loss/jsd": 0.0, "loss/logits": 0.22376175969839096, "step": 7556 }, { "epoch": 0.472375, "grad_norm": 2.796875, "grad_norm_var": 0.05636393229166667, "learning_rate": 0.0001, "loss": 7.7982, "loss/crossentropy": 2.5513200759887695, "loss/hidden": 3.09375, "loss/jsd": 0.0, "loss/logits": 0.2600383460521698, "step": 7558 }, { "epoch": 0.4725, "grad_norm": 2.640625, "grad_norm_var": 0.056103515625, "learning_rate": 0.0001, "loss": 7.7034, "loss/crossentropy": 2.214058756828308, "loss/hidden": 2.9921875, "loss/jsd": 0.0, "loss/logits": 0.22493215650320053, "step": 7560 }, { "epoch": 0.472625, "grad_norm": 2.578125, "grad_norm_var": 0.05635477701822917, "learning_rate": 0.0001, "loss": 7.2966, "loss/crossentropy": 2.2738345861434937, "loss/hidden": 2.9453125, "loss/jsd": 0.0, "loss/logits": 0.21413611620664597, "step": 7562 }, { "epoch": 0.47275, "grad_norm": 2.4375, "grad_norm_var": 0.056029256184895834, "learning_rate": 0.0001, "loss": 7.494, "loss/crossentropy": 2.328799247741699, "loss/hidden": 2.90625, "loss/jsd": 0.0, "loss/logits": 0.21939349174499512, "step": 7564 }, { "epoch": 0.472875, "grad_norm": 2.515625, "grad_norm_var": 0.05562744140625, "learning_rate": 0.0001, "loss": 7.4758, "loss/crossentropy": 2.453641414642334, "loss/hidden": 2.9140625, "loss/jsd": 0.0, "loss/logits": 0.2408982664346695, "step": 7566 }, { "epoch": 0.473, "grad_norm": 2.546875, "grad_norm_var": 0.056050618489583336, "learning_rate": 0.0001, "loss": 7.487, "loss/crossentropy": 2.296906352043152, "loss/hidden": 3.0078125, "loss/jsd": 0.0, "loss/logits": 0.23819995671510696, "step": 7568 }, { "epoch": 0.473125, "grad_norm": 2.515625, "grad_norm_var": 0.05886128743489583, "learning_rate": 0.0001, "loss": 7.4547, "loss/crossentropy": 2.109784483909607, "loss/hidden": 2.859375, "loss/jsd": 0.0, "loss/logits": 0.21301881223917007, "step": 7570 }, { "epoch": 0.47325, "grad_norm": 2.734375, "grad_norm_var": 0.05712483723958333, "learning_rate": 0.0001, "loss": 7.6148, "loss/crossentropy": 2.5796960592269897, "loss/hidden": 3.078125, "loss/jsd": 0.0, "loss/logits": 0.2447682023048401, "step": 7572 }, { "epoch": 0.473375, "grad_norm": 2.765625, "grad_norm_var": 0.013752237955729166, "learning_rate": 0.0001, "loss": 7.5227, "loss/crossentropy": 2.021949529647827, "loss/hidden": 2.9765625, "loss/jsd": 0.0, "loss/logits": 0.24036672711372375, "step": 7574 }, { "epoch": 0.4735, "grad_norm": 2.796875, "grad_norm_var": 0.015656534830729166, "learning_rate": 0.0001, "loss": 7.6206, "loss/crossentropy": 2.4127408266067505, "loss/hidden": 2.984375, "loss/jsd": 0.0, "loss/logits": 0.2284799963235855, "step": 7576 }, { "epoch": 0.473625, "grad_norm": 2.46875, "grad_norm_var": 0.016380818684895833, "learning_rate": 0.0001, "loss": 7.5752, "loss/crossentropy": 2.255155086517334, "loss/hidden": 3.078125, "loss/jsd": 0.0, "loss/logits": 0.24932511150836945, "step": 7578 }, { "epoch": 0.47375, "grad_norm": 2.40625, "grad_norm_var": 0.0178619384765625, "learning_rate": 0.0001, "loss": 7.3951, "loss/crossentropy": 2.166309952735901, "loss/hidden": 2.984375, "loss/jsd": 0.0, "loss/logits": 0.2227797880768776, "step": 7580 }, { "epoch": 0.473875, "grad_norm": 2.40625, "grad_norm_var": 0.0192047119140625, "learning_rate": 0.0001, "loss": 7.3734, "loss/crossentropy": 2.4856892824172974, "loss/hidden": 2.90625, "loss/jsd": 0.0, "loss/logits": 0.2069193795323372, "step": 7582 }, { "epoch": 0.474, "grad_norm": 2.5, "grad_norm_var": 0.016340128580729165, "learning_rate": 0.0001, "loss": 7.4122, "loss/crossentropy": 2.2524434328079224, "loss/hidden": 2.875, "loss/jsd": 0.0, "loss/logits": 0.22085583955049515, "step": 7584 }, { "epoch": 0.474125, "grad_norm": 2.5625, "grad_norm_var": 0.014188639322916667, "learning_rate": 0.0001, "loss": 7.5415, "loss/crossentropy": 2.21351957321167, "loss/hidden": 2.9609375, "loss/jsd": 0.0, "loss/logits": 0.2274496853351593, "step": 7586 }, { "epoch": 0.47425, "grad_norm": 2.453125, "grad_norm_var": 0.0136138916015625, "learning_rate": 0.0001, "loss": 7.4996, "loss/crossentropy": 2.208716034889221, "loss/hidden": 3.046875, "loss/jsd": 0.0, "loss/logits": 0.2340795323252678, "step": 7588 }, { "epoch": 0.474375, "grad_norm": 2.71875, "grad_norm_var": 0.011498006184895833, "learning_rate": 0.0001, "loss": 7.3798, "loss/crossentropy": 2.2372665405273438, "loss/hidden": 2.875, "loss/jsd": 0.0, "loss/logits": 0.20566194504499435, "step": 7590 }, { "epoch": 0.4745, "grad_norm": 2.796875, "grad_norm_var": 0.015038045247395833, "learning_rate": 0.0001, "loss": 7.3776, "loss/crossentropy": 2.244900107383728, "loss/hidden": 2.9296875, "loss/jsd": 0.0, "loss/logits": 0.22651128470897675, "step": 7592 }, { "epoch": 0.474625, "grad_norm": 2.578125, "grad_norm_var": 0.0149078369140625, "learning_rate": 0.0001, "loss": 7.4526, "loss/crossentropy": 2.165702223777771, "loss/hidden": 2.9296875, "loss/jsd": 0.0, "loss/logits": 0.22590532898902893, "step": 7594 }, { "epoch": 0.47475, "grad_norm": 2.46875, "grad_norm_var": 0.013557942708333333, "learning_rate": 0.0001, "loss": 7.3453, "loss/crossentropy": 2.1199495792388916, "loss/hidden": 2.8984375, "loss/jsd": 0.0, "loss/logits": 0.21129493415355682, "step": 7596 }, { "epoch": 0.474875, "grad_norm": 2.4375, "grad_norm_var": 0.013505045572916667, "learning_rate": 0.0001, "loss": 7.3065, "loss/crossentropy": 2.2031368017196655, "loss/hidden": 2.953125, "loss/jsd": 0.0, "loss/logits": 0.22692148387432098, "step": 7598 }, { "epoch": 0.475, "grad_norm": 2.703125, "grad_norm_var": 0.014892578125, "learning_rate": 0.0001, "loss": 7.5472, "loss/crossentropy": 2.367129683494568, "loss/hidden": 2.953125, "loss/jsd": 0.0, "loss/logits": 0.22869062423706055, "step": 7600 }, { "epoch": 0.475125, "grad_norm": 2.546875, "grad_norm_var": 0.0157135009765625, "learning_rate": 0.0001, "loss": 7.4162, "loss/crossentropy": 2.3395384550094604, "loss/hidden": 2.9296875, "loss/jsd": 0.0, "loss/logits": 0.2049834504723549, "step": 7602 }, { "epoch": 0.47525, "grad_norm": 2.609375, "grad_norm_var": 0.015576171875, "learning_rate": 0.0001, "loss": 7.9266, "loss/crossentropy": 2.5983808040618896, "loss/hidden": 3.1015625, "loss/jsd": 0.0, "loss/logits": 0.27822645008563995, "step": 7604 }, { "epoch": 0.475375, "grad_norm": 2.421875, "grad_norm_var": 0.016109212239583334, "learning_rate": 0.0001, "loss": 7.5688, "loss/crossentropy": 2.232957601547241, "loss/hidden": 2.890625, "loss/jsd": 0.0, "loss/logits": 0.2200288102030754, "step": 7606 }, { "epoch": 0.4755, "grad_norm": 2.53125, "grad_norm_var": 0.009959920247395834, "learning_rate": 0.0001, "loss": 7.3359, "loss/crossentropy": 2.4035634994506836, "loss/hidden": 2.984375, "loss/jsd": 0.0, "loss/logits": 0.21612004935741425, "step": 7608 }, { "epoch": 0.475625, "grad_norm": 2.640625, "grad_norm_var": 0.01148681640625, "learning_rate": 0.0001, "loss": 7.5726, "loss/crossentropy": 2.2749183177948, "loss/hidden": 2.8984375, "loss/jsd": 0.0, "loss/logits": 0.21551556140184402, "step": 7610 }, { "epoch": 0.47575, "grad_norm": 2.5, "grad_norm_var": 0.0110992431640625, "learning_rate": 0.0001, "loss": 7.3482, "loss/crossentropy": 2.165304183959961, "loss/hidden": 2.9296875, "loss/jsd": 0.0, "loss/logits": 0.23403477668762207, "step": 7612 }, { "epoch": 0.475875, "grad_norm": 2.359375, "grad_norm_var": 0.012555948893229167, "learning_rate": 0.0001, "loss": 7.3341, "loss/crossentropy": 2.44111430644989, "loss/hidden": 2.953125, "loss/jsd": 0.0, "loss/logits": 0.21368762850761414, "step": 7614 }, { "epoch": 0.476, "grad_norm": 2.765625, "grad_norm_var": 0.01431884765625, "learning_rate": 0.0001, "loss": 7.6824, "loss/crossentropy": 2.5847959518432617, "loss/hidden": 2.9609375, "loss/jsd": 0.0, "loss/logits": 0.23075634986162186, "step": 7616 }, { "epoch": 0.476125, "grad_norm": 2.578125, "grad_norm_var": 0.013834635416666666, "learning_rate": 0.0001, "loss": 7.5093, "loss/crossentropy": 2.39164662361145, "loss/hidden": 2.9609375, "loss/jsd": 0.0, "loss/logits": 0.21891236305236816, "step": 7618 }, { "epoch": 0.47625, "grad_norm": 2.5625, "grad_norm_var": 0.011571248372395834, "learning_rate": 0.0001, "loss": 7.6706, "loss/crossentropy": 2.396565318107605, "loss/hidden": 2.984375, "loss/jsd": 0.0, "loss/logits": 0.2194042131304741, "step": 7620 }, { "epoch": 0.476375, "grad_norm": 2.515625, "grad_norm_var": 0.010383097330729167, "learning_rate": 0.0001, "loss": 7.4296, "loss/crossentropy": 2.4205228090286255, "loss/hidden": 3.015625, "loss/jsd": 0.0, "loss/logits": 0.24130597710609436, "step": 7622 }, { "epoch": 0.4765, "grad_norm": 2.5625, "grad_norm_var": 0.009944661458333334, "learning_rate": 0.0001, "loss": 7.8026, "loss/crossentropy": 2.3281376361846924, "loss/hidden": 2.984375, "loss/jsd": 0.0, "loss/logits": 0.2429886758327484, "step": 7624 }, { "epoch": 0.476625, "grad_norm": 2.546875, "grad_norm_var": 0.010139973958333333, "learning_rate": 0.0001, "loss": 7.3055, "loss/crossentropy": 2.0070475339889526, "loss/hidden": 2.8671875, "loss/jsd": 0.0, "loss/logits": 0.21404751390218735, "step": 7626 }, { "epoch": 0.47675, "grad_norm": 2.515625, "grad_norm_var": 0.010481770833333333, "learning_rate": 0.0001, "loss": 7.3008, "loss/crossentropy": 2.178444743156433, "loss/hidden": 2.921875, "loss/jsd": 0.0, "loss/logits": 0.21427331119775772, "step": 7628 }, { "epoch": 0.476875, "grad_norm": 2.796875, "grad_norm_var": 0.010944620768229166, "learning_rate": 0.0001, "loss": 7.4672, "loss/crossentropy": 2.4490493535995483, "loss/hidden": 3.046875, "loss/jsd": 0.0, "loss/logits": 0.25556573271751404, "step": 7630 }, { "epoch": 0.477, "grad_norm": 2.515625, "grad_norm_var": 0.007645670572916667, "learning_rate": 0.0001, "loss": 7.4375, "loss/crossentropy": 2.271330714225769, "loss/hidden": 2.875, "loss/jsd": 0.0, "loss/logits": 0.22136002033948898, "step": 7632 }, { "epoch": 0.477125, "grad_norm": 2.484375, "grad_norm_var": 0.008185831705729167, "learning_rate": 0.0001, "loss": 7.4263, "loss/crossentropy": 2.32911217212677, "loss/hidden": 2.8984375, "loss/jsd": 0.0, "loss/logits": 0.209008127450943, "step": 7634 }, { "epoch": 0.47725, "grad_norm": 2.4375, "grad_norm_var": 0.009935506184895833, "learning_rate": 0.0001, "loss": 7.5227, "loss/crossentropy": 1.967372179031372, "loss/hidden": 2.96875, "loss/jsd": 0.0, "loss/logits": 0.20427028834819794, "step": 7636 }, { "epoch": 0.477375, "grad_norm": 2.484375, "grad_norm_var": 0.010184733072916667, "learning_rate": 0.0001, "loss": 7.3611, "loss/crossentropy": 2.032367706298828, "loss/hidden": 2.890625, "loss/jsd": 0.0, "loss/logits": 0.222863607108593, "step": 7638 }, { "epoch": 0.4775, "grad_norm": 2.578125, "grad_norm_var": 0.008983357747395834, "learning_rate": 0.0001, "loss": 7.3847, "loss/crossentropy": 2.383857250213623, "loss/hidden": 2.96875, "loss/jsd": 0.0, "loss/logits": 0.22510264068841934, "step": 7640 }, { "epoch": 0.477625, "grad_norm": 2.328125, "grad_norm_var": 0.010868326822916666, "learning_rate": 0.0001, "loss": 7.3618, "loss/crossentropy": 2.2717517614364624, "loss/hidden": 2.921875, "loss/jsd": 0.0, "loss/logits": 0.2107211798429489, "step": 7642 }, { "epoch": 0.47775, "grad_norm": 2.8125, "grad_norm_var": 0.015478515625, "learning_rate": 0.0001, "loss": 7.5132, "loss/crossentropy": 2.28407883644104, "loss/hidden": 2.9140625, "loss/jsd": 0.0, "loss/logits": 0.22158857434988022, "step": 7644 }, { "epoch": 0.477875, "grad_norm": 2.65625, "grad_norm_var": 0.028059895833333334, "learning_rate": 0.0001, "loss": 7.5097, "loss/crossentropy": 2.354778289794922, "loss/hidden": 2.9765625, "loss/jsd": 0.0, "loss/logits": 0.2382299154996872, "step": 7646 }, { "epoch": 0.478, "grad_norm": 2.515625, "grad_norm_var": 0.028645833333333332, "learning_rate": 0.0001, "loss": 7.3457, "loss/crossentropy": 2.1595723628997803, "loss/hidden": 2.921875, "loss/jsd": 0.0, "loss/logits": 0.21302711963653564, "step": 7648 }, { "epoch": 0.478125, "grad_norm": 2.515625, "grad_norm_var": 0.027855428059895833, "learning_rate": 0.0001, "loss": 7.2365, "loss/crossentropy": 2.0668729543685913, "loss/hidden": 2.9296875, "loss/jsd": 0.0, "loss/logits": 0.19729968905448914, "step": 7650 }, { "epoch": 0.47825, "grad_norm": 2.53125, "grad_norm_var": 0.026178995768229168, "learning_rate": 0.0001, "loss": 7.7063, "loss/crossentropy": 2.2035917043685913, "loss/hidden": 2.8828125, "loss/jsd": 0.0, "loss/logits": 0.23489320278167725, "step": 7652 }, { "epoch": 0.478375, "grad_norm": 2.5, "grad_norm_var": 0.026105753580729165, "learning_rate": 0.0001, "loss": 7.6605, "loss/crossentropy": 2.1541595458984375, "loss/hidden": 2.953125, "loss/jsd": 0.0, "loss/logits": 0.22334295511245728, "step": 7654 }, { "epoch": 0.4785, "grad_norm": 2.546875, "grad_norm_var": 0.026764933268229166, "learning_rate": 0.0001, "loss": 7.4256, "loss/crossentropy": 2.4176862239837646, "loss/hidden": 2.9921875, "loss/jsd": 0.0, "loss/logits": 0.2326817810535431, "step": 7656 }, { "epoch": 0.478625, "grad_norm": 2.65625, "grad_norm_var": 0.0205078125, "learning_rate": 0.0001, "loss": 7.6502, "loss/crossentropy": 2.4046895503997803, "loss/hidden": 2.984375, "loss/jsd": 0.0, "loss/logits": 0.22558057308197021, "step": 7658 }, { "epoch": 0.47875, "grad_norm": 2.5, "grad_norm_var": 0.018626912434895834, "learning_rate": 0.0001, "loss": 7.4103, "loss/crossentropy": 2.223700165748596, "loss/hidden": 3.03125, "loss/jsd": 0.0, "loss/logits": 0.22808054089546204, "step": 7660 }, { "epoch": 0.478875, "grad_norm": 2.71875, "grad_norm_var": 0.005757649739583333, "learning_rate": 0.0001, "loss": 7.6832, "loss/crossentropy": 2.471190333366394, "loss/hidden": 2.9296875, "loss/jsd": 0.0, "loss/logits": 0.23807856440544128, "step": 7662 }, { "epoch": 0.479, "grad_norm": 2.71875, "grad_norm_var": 0.007222493489583333, "learning_rate": 0.0001, "loss": 7.5606, "loss/crossentropy": 2.371188998222351, "loss/hidden": 3.0546875, "loss/jsd": 0.0, "loss/logits": 0.24270819127559662, "step": 7664 }, { "epoch": 0.479125, "grad_norm": 2.5, "grad_norm_var": 0.011356608072916666, "learning_rate": 0.0001, "loss": 7.5801, "loss/crossentropy": 2.3195821046829224, "loss/hidden": 3.09375, "loss/jsd": 0.0, "loss/logits": 0.24429184198379517, "step": 7666 }, { "epoch": 0.47925, "grad_norm": 2.4375, "grad_norm_var": 0.017769368489583333, "learning_rate": 0.0001, "loss": 7.3231, "loss/crossentropy": 2.1542683839797974, "loss/hidden": 2.890625, "loss/jsd": 0.0, "loss/logits": 0.20967034995555878, "step": 7668 }, { "epoch": 0.479375, "grad_norm": 2.375, "grad_norm_var": 0.020426432291666668, "learning_rate": 0.0001, "loss": 7.2221, "loss/crossentropy": 2.2892602682113647, "loss/hidden": 2.984375, "loss/jsd": 0.0, "loss/logits": 0.24644708633422852, "step": 7670 }, { "epoch": 0.4795, "grad_norm": 2.5, "grad_norm_var": 0.019710286458333334, "learning_rate": 0.0001, "loss": 7.5469, "loss/crossentropy": 2.3171985149383545, "loss/hidden": 3.0, "loss/jsd": 0.0, "loss/logits": 0.22776274383068085, "step": 7672 }, { "epoch": 0.479625, "grad_norm": 2.484375, "grad_norm_var": 0.021773274739583334, "learning_rate": 0.0001, "loss": 7.3813, "loss/crossentropy": 2.278432607650757, "loss/hidden": 3.0546875, "loss/jsd": 0.0, "loss/logits": 0.22575496137142181, "step": 7674 }, { "epoch": 0.47975, "grad_norm": 2.75, "grad_norm_var": 0.023567708333333333, "learning_rate": 0.0001, "loss": 7.4644, "loss/crossentropy": 2.256859540939331, "loss/hidden": 3.09375, "loss/jsd": 0.0, "loss/logits": 0.22266656160354614, "step": 7676 }, { "epoch": 0.479875, "grad_norm": 2.6875, "grad_norm_var": 0.024714152018229168, "learning_rate": 0.0001, "loss": 7.465, "loss/crossentropy": 2.339644193649292, "loss/hidden": 2.9921875, "loss/jsd": 0.0, "loss/logits": 0.23373190313577652, "step": 7678 }, { "epoch": 0.48, "grad_norm": 2.46875, "grad_norm_var": 0.02255859375, "learning_rate": 0.0001, "loss": 7.2745, "loss/crossentropy": 2.1255003213882446, "loss/hidden": 2.9609375, "loss/jsd": 0.0, "loss/logits": 0.2173672914505005, "step": 7680 }, { "epoch": 0.480125, "grad_norm": 2.890625, "grad_norm_var": 0.023661295572916668, "learning_rate": 0.0001, "loss": 7.5856, "loss/crossentropy": 2.2506037950515747, "loss/hidden": 2.875, "loss/jsd": 0.0, "loss/logits": 0.22600781917572021, "step": 7682 }, { "epoch": 0.48025, "grad_norm": 2.59375, "grad_norm_var": 0.018456013997395833, "learning_rate": 0.0001, "loss": 7.6097, "loss/crossentropy": 2.1004857420921326, "loss/hidden": 2.875, "loss/jsd": 0.0, "loss/logits": 0.2149680256843567, "step": 7684 }, { "epoch": 0.480375, "grad_norm": 2.875, "grad_norm_var": 0.0194732666015625, "learning_rate": 0.0001, "loss": 7.5167, "loss/crossentropy": 2.1856456995010376, "loss/hidden": 2.9296875, "loss/jsd": 0.0, "loss/logits": 0.23217922449111938, "step": 7686 }, { "epoch": 0.4805, "grad_norm": 2.3125, "grad_norm_var": 0.024909464518229167, "learning_rate": 0.0001, "loss": 7.4797, "loss/crossentropy": 2.2617682218551636, "loss/hidden": 2.9609375, "loss/jsd": 0.0, "loss/logits": 0.2151031494140625, "step": 7688 }, { "epoch": 0.480625, "grad_norm": 2.421875, "grad_norm_var": 0.026529947916666668, "learning_rate": 0.0001, "loss": 7.437, "loss/crossentropy": 2.3735666275024414, "loss/hidden": 3.0390625, "loss/jsd": 0.0, "loss/logits": 0.2296081706881523, "step": 7690 }, { "epoch": 0.48075, "grad_norm": 2.5625, "grad_norm_var": 0.0254302978515625, "learning_rate": 0.0001, "loss": 7.5429, "loss/crossentropy": 2.3900094032287598, "loss/hidden": 2.9140625, "loss/jsd": 0.0, "loss/logits": 0.21757236123085022, "step": 7692 }, { "epoch": 0.480875, "grad_norm": 2.453125, "grad_norm_var": 0.025194295247395835, "learning_rate": 0.0001, "loss": 7.4433, "loss/crossentropy": 2.2048407793045044, "loss/hidden": 2.8671875, "loss/jsd": 0.0, "loss/logits": 0.20764882862567902, "step": 7694 }, { "epoch": 0.481, "grad_norm": 2.5, "grad_norm_var": 0.025227864583333332, "learning_rate": 0.0001, "loss": 7.466, "loss/crossentropy": 2.3645886182785034, "loss/hidden": 2.9140625, "loss/jsd": 0.0, "loss/logits": 0.22343488037586212, "step": 7696 }, { "epoch": 0.481125, "grad_norm": 2.53125, "grad_norm_var": 0.019950358072916667, "learning_rate": 0.0001, "loss": 7.3634, "loss/crossentropy": 2.226369559764862, "loss/hidden": 2.9296875, "loss/jsd": 0.0, "loss/logits": 0.20470194518566132, "step": 7698 }, { "epoch": 0.48125, "grad_norm": 2.609375, "grad_norm_var": 0.020531209309895833, "learning_rate": 0.0001, "loss": 7.1426, "loss/crossentropy": 2.2123297452926636, "loss/hidden": 2.8828125, "loss/jsd": 0.0, "loss/logits": 0.20936231315135956, "step": 7700 }, { "epoch": 0.481375, "grad_norm": 2.53125, "grad_norm_var": 0.0131988525390625, "learning_rate": 0.0001, "loss": 7.4255, "loss/crossentropy": 2.328381061553955, "loss/hidden": 3.0234375, "loss/jsd": 0.0, "loss/logits": 0.21627917885780334, "step": 7702 }, { "epoch": 0.4815, "grad_norm": 2.515625, "grad_norm_var": 0.009989420572916666, "learning_rate": 0.0001, "loss": 7.3954, "loss/crossentropy": 2.4912930727005005, "loss/hidden": 2.953125, "loss/jsd": 0.0, "loss/logits": 0.2397535890340805, "step": 7704 }, { "epoch": 0.481625, "grad_norm": 2.59375, "grad_norm_var": 0.005887858072916667, "learning_rate": 0.0001, "loss": 7.3218, "loss/crossentropy": 2.2447816133499146, "loss/hidden": 2.8984375, "loss/jsd": 0.0, "loss/logits": 0.20891211181879044, "step": 7706 }, { "epoch": 0.48175, "grad_norm": 2.40625, "grad_norm_var": 0.006883748372395833, "learning_rate": 0.0001, "loss": 7.3095, "loss/crossentropy": 2.4504369497299194, "loss/hidden": 2.9375, "loss/jsd": 0.0, "loss/logits": 0.21755102276802063, "step": 7708 }, { "epoch": 0.481875, "grad_norm": 2.609375, "grad_norm_var": 0.00654296875, "learning_rate": 0.0001, "loss": 7.2964, "loss/crossentropy": 2.062195658683777, "loss/hidden": 2.90625, "loss/jsd": 0.0, "loss/logits": 0.21777213364839554, "step": 7710 }, { "epoch": 0.482, "grad_norm": 2.578125, "grad_norm_var": 0.010205078125, "learning_rate": 0.0001, "loss": 7.3492, "loss/crossentropy": 2.130585551261902, "loss/hidden": 2.9296875, "loss/jsd": 0.0, "loss/logits": 0.21184702962636948, "step": 7712 }, { "epoch": 0.482125, "grad_norm": 2.6875, "grad_norm_var": 0.0116363525390625, "learning_rate": 0.0001, "loss": 7.7261, "loss/crossentropy": 2.2480320930480957, "loss/hidden": 2.984375, "loss/jsd": 0.0, "loss/logits": 0.2256017103791237, "step": 7714 }, { "epoch": 0.48225, "grad_norm": 2.765625, "grad_norm_var": 0.017378743489583334, "learning_rate": 0.0001, "loss": 7.5843, "loss/crossentropy": 2.288808822631836, "loss/hidden": 2.90625, "loss/jsd": 0.0, "loss/logits": 0.20400400459766388, "step": 7716 }, { "epoch": 0.482375, "grad_norm": 2.4375, "grad_norm_var": 0.022119140625, "learning_rate": 0.0001, "loss": 7.4667, "loss/crossentropy": 2.3060598373413086, "loss/hidden": 2.9453125, "loss/jsd": 0.0, "loss/logits": 0.2185887172818184, "step": 7718 }, { "epoch": 0.4825, "grad_norm": 2.484375, "grad_norm_var": 0.0204010009765625, "learning_rate": 0.0001, "loss": 7.4278, "loss/crossentropy": 2.4010475873947144, "loss/hidden": 3.109375, "loss/jsd": 0.0, "loss/logits": 0.2346554398536682, "step": 7720 }, { "epoch": 0.482625, "grad_norm": 2.4375, "grad_norm_var": 0.021825154622395832, "learning_rate": 0.0001, "loss": 7.2517, "loss/crossentropy": 2.2389672994613647, "loss/hidden": 2.890625, "loss/jsd": 0.0, "loss/logits": 0.2268006056547165, "step": 7722 }, { "epoch": 0.48275, "grad_norm": 2.59375, "grad_norm_var": 0.019701131184895835, "learning_rate": 0.0001, "loss": 7.574, "loss/crossentropy": 2.4582513570785522, "loss/hidden": 3.0078125, "loss/jsd": 0.0, "loss/logits": 0.2260364592075348, "step": 7724 }, { "epoch": 0.482875, "grad_norm": 2.453125, "grad_norm_var": 0.017899576822916666, "learning_rate": 0.0001, "loss": 7.3459, "loss/crossentropy": 2.1185667514801025, "loss/hidden": 2.9296875, "loss/jsd": 0.0, "loss/logits": 0.2260960265994072, "step": 7726 }, { "epoch": 0.483, "grad_norm": 2.453125, "grad_norm_var": 0.017650349934895834, "learning_rate": 0.0001, "loss": 7.3828, "loss/crossentropy": 2.3576878905296326, "loss/hidden": 2.875, "loss/jsd": 0.0, "loss/logits": 0.21902261674404144, "step": 7728 }, { "epoch": 0.483125, "grad_norm": 2.609375, "grad_norm_var": 0.0191314697265625, "learning_rate": 0.0001, "loss": 7.2565, "loss/crossentropy": 2.0476362705230713, "loss/hidden": 2.84375, "loss/jsd": 0.0, "loss/logits": 0.20675259828567505, "step": 7730 }, { "epoch": 0.48325, "grad_norm": 2.984375, "grad_norm_var": 0.025593058268229166, "learning_rate": 0.0001, "loss": 7.5223, "loss/crossentropy": 2.1531362533569336, "loss/hidden": 2.921875, "loss/jsd": 0.0, "loss/logits": 0.22793357074260712, "step": 7732 }, { "epoch": 0.483375, "grad_norm": 2.703125, "grad_norm_var": 0.026025390625, "learning_rate": 0.0001, "loss": 7.7495, "loss/crossentropy": 2.3575661182403564, "loss/hidden": 2.9921875, "loss/jsd": 0.0, "loss/logits": 0.23020553588867188, "step": 7734 }, { "epoch": 0.4835, "grad_norm": 2.609375, "grad_norm_var": 0.0252593994140625, "learning_rate": 0.0001, "loss": 7.4587, "loss/crossentropy": 2.401484966278076, "loss/hidden": 3.015625, "loss/jsd": 0.0, "loss/logits": 0.2349899485707283, "step": 7736 }, { "epoch": 0.483625, "grad_norm": 2.703125, "grad_norm_var": 0.023932902018229167, "learning_rate": 0.0001, "loss": 7.4563, "loss/crossentropy": 2.2744656801223755, "loss/hidden": 2.96875, "loss/jsd": 0.0, "loss/logits": 0.2291347235441208, "step": 7738 }, { "epoch": 0.48375, "grad_norm": 2.53125, "grad_norm_var": 0.0242584228515625, "learning_rate": 0.0001, "loss": 7.393, "loss/crossentropy": 2.123945474624634, "loss/hidden": 2.8515625, "loss/jsd": 0.0, "loss/logits": 0.22436197847127914, "step": 7740 }, { "epoch": 0.483875, "grad_norm": 2.578125, "grad_norm_var": 0.02261962890625, "learning_rate": 0.0001, "loss": 7.4852, "loss/crossentropy": 2.08186012506485, "loss/hidden": 2.921875, "loss/jsd": 0.0, "loss/logits": 0.22255352139472961, "step": 7742 }, { "epoch": 0.484, "grad_norm": 2.6875, "grad_norm_var": 0.0221343994140625, "learning_rate": 0.0001, "loss": 7.6033, "loss/crossentropy": 2.4600019454956055, "loss/hidden": 2.9453125, "loss/jsd": 0.0, "loss/logits": 0.23556332290172577, "step": 7744 }, { "epoch": 0.484125, "grad_norm": 2.5625, "grad_norm_var": 0.0175689697265625, "learning_rate": 0.0001, "loss": 7.6798, "loss/crossentropy": 2.4723145961761475, "loss/hidden": 2.9453125, "loss/jsd": 0.0, "loss/logits": 0.23074186593294144, "step": 7746 }, { "epoch": 0.48425, "grad_norm": 2.59375, "grad_norm_var": 0.011302693684895834, "learning_rate": 0.0001, "loss": 7.5287, "loss/crossentropy": 2.440352201461792, "loss/hidden": 3.109375, "loss/jsd": 0.0, "loss/logits": 0.2461652085185051, "step": 7748 }, { "epoch": 0.484375, "grad_norm": 2.515625, "grad_norm_var": 0.010155232747395833, "learning_rate": 0.0001, "loss": 7.4027, "loss/crossentropy": 2.129585385322571, "loss/hidden": 2.90625, "loss/jsd": 0.0, "loss/logits": 0.21456780284643173, "step": 7750 }, { "epoch": 0.4845, "grad_norm": 2.734375, "grad_norm_var": 0.011872355143229167, "learning_rate": 0.0001, "loss": 7.8, "loss/crossentropy": 2.343677043914795, "loss/hidden": 3.0, "loss/jsd": 0.0, "loss/logits": 0.23485040664672852, "step": 7752 }, { "epoch": 0.484625, "grad_norm": 2.625, "grad_norm_var": 0.011384073893229167, "learning_rate": 0.0001, "loss": 7.5342, "loss/crossentropy": 2.139711618423462, "loss/hidden": 2.875, "loss/jsd": 0.0, "loss/logits": 0.21154740452766418, "step": 7754 }, { "epoch": 0.48475, "grad_norm": 2.671875, "grad_norm_var": 0.013434855143229167, "learning_rate": 0.0001, "loss": 7.5941, "loss/crossentropy": 2.343800902366638, "loss/hidden": 2.8984375, "loss/jsd": 0.0, "loss/logits": 0.22716201841831207, "step": 7756 }, { "epoch": 0.484875, "grad_norm": 2.515625, "grad_norm_var": 0.015111287434895834, "learning_rate": 0.0001, "loss": 7.4082, "loss/crossentropy": 2.2813916206359863, "loss/hidden": 2.8828125, "loss/jsd": 0.0, "loss/logits": 0.2133931964635849, "step": 7758 }, { "epoch": 0.485, "grad_norm": 2.53125, "grad_norm_var": 0.011962890625, "learning_rate": 0.0001, "loss": 7.4294, "loss/crossentropy": 2.287988066673279, "loss/hidden": 2.953125, "loss/jsd": 0.0, "loss/logits": 0.21577558666467667, "step": 7760 }, { "epoch": 0.485125, "grad_norm": 2.46875, "grad_norm_var": 0.011702473958333333, "learning_rate": 0.0001, "loss": 7.4644, "loss/crossentropy": 2.1854101419448853, "loss/hidden": 2.9140625, "loss/jsd": 0.0, "loss/logits": 0.23755793273448944, "step": 7762 }, { "epoch": 0.48525, "grad_norm": 2.4375, "grad_norm_var": 0.011617024739583334, "learning_rate": 0.0001, "loss": 7.3439, "loss/crossentropy": 2.1397287845611572, "loss/hidden": 2.8515625, "loss/jsd": 0.0, "loss/logits": 0.20280467718839645, "step": 7764 }, { "epoch": 0.485375, "grad_norm": 2.515625, "grad_norm_var": 0.011839803059895833, "learning_rate": 0.0001, "loss": 7.3688, "loss/crossentropy": 2.298658847808838, "loss/hidden": 2.8984375, "loss/jsd": 0.0, "loss/logits": 0.22496476024389267, "step": 7766 }, { "epoch": 0.4855, "grad_norm": 2.21875, "grad_norm_var": 0.013134765625, "learning_rate": 0.0001, "loss": 7.1846, "loss/crossentropy": 2.1197171211242676, "loss/hidden": 2.84375, "loss/jsd": 0.0, "loss/logits": 0.19107449054718018, "step": 7768 }, { "epoch": 0.485625, "grad_norm": 2.953125, "grad_norm_var": 0.026460774739583335, "learning_rate": 0.0001, "loss": 7.4421, "loss/crossentropy": 2.2282586097717285, "loss/hidden": 2.8984375, "loss/jsd": 0.0, "loss/logits": 0.22302527725696564, "step": 7770 }, { "epoch": 0.48575, "grad_norm": 2.5625, "grad_norm_var": 0.025179036458333335, "learning_rate": 0.0001, "loss": 7.4854, "loss/crossentropy": 2.3631385564804077, "loss/hidden": 2.9296875, "loss/jsd": 0.0, "loss/logits": 0.2319716364145279, "step": 7772 }, { "epoch": 0.485875, "grad_norm": 2.4375, "grad_norm_var": 0.0246002197265625, "learning_rate": 0.0001, "loss": 7.2064, "loss/crossentropy": 2.2677615880966187, "loss/hidden": 2.9375, "loss/jsd": 0.0, "loss/logits": 0.22639983892440796, "step": 7774 }, { "epoch": 0.486, "grad_norm": 2.546875, "grad_norm_var": 0.0266265869140625, "learning_rate": 0.0001, "loss": 7.6314, "loss/crossentropy": 2.1911749839782715, "loss/hidden": 2.859375, "loss/jsd": 0.0, "loss/logits": 0.2200656533241272, "step": 7776 }, { "epoch": 0.486125, "grad_norm": 2.3125, "grad_norm_var": 0.028197224934895834, "learning_rate": 0.0001, "loss": 7.4255, "loss/crossentropy": 2.172629475593567, "loss/hidden": 2.9453125, "loss/jsd": 0.0, "loss/logits": 0.21256321668624878, "step": 7778 }, { "epoch": 0.48625, "grad_norm": 2.640625, "grad_norm_var": 0.028718058268229166, "learning_rate": 0.0001, "loss": 7.6268, "loss/crossentropy": 2.1926246881484985, "loss/hidden": 2.921875, "loss/jsd": 0.0, "loss/logits": 0.2267642542719841, "step": 7780 }, { "epoch": 0.486375, "grad_norm": 2.46875, "grad_norm_var": 0.028934733072916666, "learning_rate": 0.0001, "loss": 7.3703, "loss/crossentropy": 2.4596643447875977, "loss/hidden": 2.9765625, "loss/jsd": 0.0, "loss/logits": 0.237305149435997, "step": 7782 }, { "epoch": 0.4865, "grad_norm": 2.625, "grad_norm_var": 0.021678670247395834, "learning_rate": 0.0001, "loss": 7.4708, "loss/crossentropy": 2.0642203092575073, "loss/hidden": 2.9609375, "loss/jsd": 0.0, "loss/logits": 0.21862874925136566, "step": 7784 }, { "epoch": 0.486625, "grad_norm": 2.625, "grad_norm_var": 0.01064453125, "learning_rate": 0.0001, "loss": 7.5104, "loss/crossentropy": 2.2006276845932007, "loss/hidden": 2.9765625, "loss/jsd": 0.0, "loss/logits": 0.23672595620155334, "step": 7786 }, { "epoch": 0.48675, "grad_norm": 2.625, "grad_norm_var": 0.01158447265625, "learning_rate": 0.0001, "loss": 7.3535, "loss/crossentropy": 2.3337786197662354, "loss/hidden": 2.921875, "loss/jsd": 0.0, "loss/logits": 0.2141759693622589, "step": 7788 }, { "epoch": 0.486875, "grad_norm": 2.390625, "grad_norm_var": 0.0138824462890625, "learning_rate": 0.0001, "loss": 7.5069, "loss/crossentropy": 2.4625933170318604, "loss/hidden": 2.9375, "loss/jsd": 0.0, "loss/logits": 0.22616465389728546, "step": 7790 }, { "epoch": 0.487, "grad_norm": 2.5625, "grad_norm_var": 0.011881510416666666, "learning_rate": 0.0001, "loss": 7.6336, "loss/crossentropy": 2.336500644683838, "loss/hidden": 3.0078125, "loss/jsd": 0.0, "loss/logits": 0.23317137360572815, "step": 7792 }, { "epoch": 0.487125, "grad_norm": 2.46875, "grad_norm_var": 0.007835896809895833, "learning_rate": 0.0001, "loss": 7.3971, "loss/crossentropy": 2.2294023036956787, "loss/hidden": 2.953125, "loss/jsd": 0.0, "loss/logits": 0.24116864055395126, "step": 7794 }, { "epoch": 0.48725, "grad_norm": 2.46875, "grad_norm_var": 0.009626261393229167, "learning_rate": 0.0001, "loss": 7.2151, "loss/crossentropy": 2.176442503929138, "loss/hidden": 2.875, "loss/jsd": 0.0, "loss/logits": 0.18946021795272827, "step": 7796 }, { "epoch": 0.487375, "grad_norm": 2.515625, "grad_norm_var": 0.008365885416666666, "learning_rate": 0.0001, "loss": 7.4888, "loss/crossentropy": 2.3833093643188477, "loss/hidden": 2.9765625, "loss/jsd": 0.0, "loss/logits": 0.23858694732189178, "step": 7798 }, { "epoch": 0.4875, "grad_norm": 2.375, "grad_norm_var": 0.0103179931640625, "learning_rate": 0.0001, "loss": 7.3589, "loss/crossentropy": 2.1194872856140137, "loss/hidden": 2.921875, "loss/jsd": 0.0, "loss/logits": 0.21685560047626495, "step": 7800 }, { "epoch": 0.487625, "grad_norm": 2.46875, "grad_norm_var": 0.010054524739583333, "learning_rate": 0.0001, "loss": 7.3416, "loss/crossentropy": 2.1309744119644165, "loss/hidden": 2.90625, "loss/jsd": 0.0, "loss/logits": 0.2185322642326355, "step": 7802 }, { "epoch": 0.48775, "grad_norm": 2.609375, "grad_norm_var": 0.0113922119140625, "learning_rate": 0.0001, "loss": 7.5563, "loss/crossentropy": 2.103081226348877, "loss/hidden": 2.953125, "loss/jsd": 0.0, "loss/logits": 0.21970642358064651, "step": 7804 }, { "epoch": 0.487875, "grad_norm": 2.46875, "grad_norm_var": 0.008210245768229167, "learning_rate": 0.0001, "loss": 7.4852, "loss/crossentropy": 2.311703085899353, "loss/hidden": 2.8671875, "loss/jsd": 0.0, "loss/logits": 0.2078799456357956, "step": 7806 }, { "epoch": 0.488, "grad_norm": 2.46875, "grad_norm_var": 0.008024088541666667, "learning_rate": 0.0001, "loss": 7.3877, "loss/crossentropy": 2.291482925415039, "loss/hidden": 2.9296875, "loss/jsd": 0.0, "loss/logits": 0.20819032937288284, "step": 7808 }, { "epoch": 0.488125, "grad_norm": 2.765625, "grad_norm_var": 0.0131744384765625, "learning_rate": 0.0001, "loss": 7.4384, "loss/crossentropy": 2.17020845413208, "loss/hidden": 2.9921875, "loss/jsd": 0.0, "loss/logits": 0.2530314475297928, "step": 7810 }, { "epoch": 0.48825, "grad_norm": 2.609375, "grad_norm_var": 0.01402587890625, "learning_rate": 0.0001, "loss": 7.5244, "loss/crossentropy": 2.504571557044983, "loss/hidden": 3.0625, "loss/jsd": 0.0, "loss/logits": 0.2497716173529625, "step": 7812 }, { "epoch": 0.488375, "grad_norm": 2.515625, "grad_norm_var": 0.0143218994140625, "learning_rate": 0.0001, "loss": 7.2297, "loss/crossentropy": 2.1718627214431763, "loss/hidden": 2.8984375, "loss/jsd": 0.0, "loss/logits": 0.22571015357971191, "step": 7814 }, { "epoch": 0.4885, "grad_norm": 2.625, "grad_norm_var": 0.0144195556640625, "learning_rate": 0.0001, "loss": 7.5706, "loss/crossentropy": 2.4650830030441284, "loss/hidden": 2.9453125, "loss/jsd": 0.0, "loss/logits": 0.23497213423252106, "step": 7816 }, { "epoch": 0.488625, "grad_norm": 2.421875, "grad_norm_var": 0.0163482666015625, "learning_rate": 0.0001, "loss": 7.3129, "loss/crossentropy": 2.2739332914352417, "loss/hidden": 2.9375, "loss/jsd": 0.0, "loss/logits": 0.22801898419857025, "step": 7818 }, { "epoch": 0.48875, "grad_norm": 2.46875, "grad_norm_var": 0.01324462890625, "learning_rate": 0.0001, "loss": 7.5592, "loss/crossentropy": 2.1486986875534058, "loss/hidden": 2.9140625, "loss/jsd": 0.0, "loss/logits": 0.2250107303261757, "step": 7820 }, { "epoch": 0.488875, "grad_norm": 2.5625, "grad_norm_var": 0.013081868489583334, "learning_rate": 0.0001, "loss": 7.3773, "loss/crossentropy": 2.16861629486084, "loss/hidden": 2.890625, "loss/jsd": 0.0, "loss/logits": 0.20403727889060974, "step": 7822 }, { "epoch": 0.489, "grad_norm": 2.46875, "grad_norm_var": 0.012007649739583333, "learning_rate": 0.0001, "loss": 7.3763, "loss/crossentropy": 2.262635350227356, "loss/hidden": 2.859375, "loss/jsd": 0.0, "loss/logits": 0.2124805599451065, "step": 7824 }, { "epoch": 0.489125, "grad_norm": 2.71875, "grad_norm_var": 0.009012858072916666, "learning_rate": 0.0001, "loss": 7.5229, "loss/crossentropy": 2.1440268754959106, "loss/hidden": 2.921875, "loss/jsd": 0.0, "loss/logits": 0.21637984365224838, "step": 7826 }, { "epoch": 0.48925, "grad_norm": 2.625, "grad_norm_var": 0.009357706705729166, "learning_rate": 0.0001, "loss": 7.4388, "loss/crossentropy": 2.2474324703216553, "loss/hidden": 2.8671875, "loss/jsd": 0.0, "loss/logits": 0.21895693242549896, "step": 7828 }, { "epoch": 0.489375, "grad_norm": 2.703125, "grad_norm_var": 0.0182769775390625, "learning_rate": 0.0001, "loss": 7.5754, "loss/crossentropy": 2.237038493156433, "loss/hidden": 2.890625, "loss/jsd": 0.0, "loss/logits": 0.2165493667125702, "step": 7830 }, { "epoch": 0.4895, "grad_norm": 2.53125, "grad_norm_var": 0.0158355712890625, "learning_rate": 0.0001, "loss": 7.4989, "loss/crossentropy": 2.051880121231079, "loss/hidden": 2.8671875, "loss/jsd": 0.0, "loss/logits": 0.23154912143945694, "step": 7832 }, { "epoch": 0.489625, "grad_norm": 3.0625, "grad_norm_var": 0.026656087239583334, "learning_rate": 0.0001, "loss": 7.6133, "loss/crossentropy": 2.5156235694885254, "loss/hidden": 3.0703125, "loss/jsd": 0.0, "loss/logits": 0.24127346277236938, "step": 7834 }, { "epoch": 0.48975, "grad_norm": 2.484375, "grad_norm_var": 0.0266754150390625, "learning_rate": 0.0001, "loss": 7.4246, "loss/crossentropy": 2.247938871383667, "loss/hidden": 2.9375, "loss/jsd": 0.0, "loss/logits": 0.2116817906498909, "step": 7836 }, { "epoch": 0.489875, "grad_norm": 2.9375, "grad_norm_var": 0.03176676432291667, "learning_rate": 0.0001, "loss": 7.3995, "loss/crossentropy": 2.2224138975143433, "loss/hidden": 2.9140625, "loss/jsd": 0.0, "loss/logits": 0.2244555950164795, "step": 7838 }, { "epoch": 0.49, "grad_norm": 2.5, "grad_norm_var": 0.033036295572916666, "learning_rate": 0.0001, "loss": 7.5834, "loss/crossentropy": 2.0298678278923035, "loss/hidden": 2.921875, "loss/jsd": 0.0, "loss/logits": 0.21158085018396378, "step": 7840 }, { "epoch": 0.490125, "grad_norm": 2.5, "grad_norm_var": 0.034911092122395834, "learning_rate": 0.0001, "loss": 7.4247, "loss/crossentropy": 2.434661865234375, "loss/hidden": 3.0, "loss/jsd": 0.0, "loss/logits": 0.2253919243812561, "step": 7842 }, { "epoch": 0.49025, "grad_norm": 2.59375, "grad_norm_var": 0.03404947916666667, "learning_rate": 0.0001, "loss": 7.52, "loss/crossentropy": 2.3960059881210327, "loss/hidden": 2.9453125, "loss/jsd": 0.0, "loss/logits": 0.22589298337697983, "step": 7844 }, { "epoch": 0.490375, "grad_norm": 2.71875, "grad_norm_var": 0.030646769205729167, "learning_rate": 0.0001, "loss": 7.551, "loss/crossentropy": 2.119168698787689, "loss/hidden": 2.921875, "loss/jsd": 0.0, "loss/logits": 0.22482646256685257, "step": 7846 }, { "epoch": 0.4905, "grad_norm": 2.546875, "grad_norm_var": 0.03142903645833333, "learning_rate": 0.0001, "loss": 7.3938, "loss/crossentropy": 2.2026538848876953, "loss/hidden": 2.859375, "loss/jsd": 0.0, "loss/logits": 0.2023685872554779, "step": 7848 }, { "epoch": 0.490625, "grad_norm": 2.515625, "grad_norm_var": 0.020035807291666666, "learning_rate": 0.0001, "loss": 7.3638, "loss/crossentropy": 2.396271586418152, "loss/hidden": 2.890625, "loss/jsd": 0.0, "loss/logits": 0.2176545038819313, "step": 7850 }, { "epoch": 0.49075, "grad_norm": 2.59375, "grad_norm_var": 0.018553670247395834, "learning_rate": 0.0001, "loss": 7.7629, "loss/crossentropy": 2.7069517374038696, "loss/hidden": 3.0078125, "loss/jsd": 0.0, "loss/logits": 0.24267833679914474, "step": 7852 }, { "epoch": 0.490875, "grad_norm": 2.734375, "grad_norm_var": 0.012007649739583333, "learning_rate": 0.0001, "loss": 7.5204, "loss/crossentropy": 2.6121277809143066, "loss/hidden": 2.9140625, "loss/jsd": 0.0, "loss/logits": 0.22969362884759903, "step": 7854 }, { "epoch": 0.491, "grad_norm": 2.578125, "grad_norm_var": 0.0103912353515625, "learning_rate": 0.0001, "loss": 7.5675, "loss/crossentropy": 2.268145203590393, "loss/hidden": 2.9609375, "loss/jsd": 0.0, "loss/logits": 0.2228652909398079, "step": 7856 }, { "epoch": 0.491125, "grad_norm": 2.421875, "grad_norm_var": 0.0138824462890625, "learning_rate": 0.0001, "loss": 7.3285, "loss/crossentropy": 2.476702570915222, "loss/hidden": 2.8984375, "loss/jsd": 0.0, "loss/logits": 0.21155445277690887, "step": 7858 }, { "epoch": 0.49125, "grad_norm": 2.46875, "grad_norm_var": 0.0108306884765625, "learning_rate": 0.0001, "loss": 7.5535, "loss/crossentropy": 2.146826982498169, "loss/hidden": 2.921875, "loss/jsd": 0.0, "loss/logits": 0.2226490005850792, "step": 7860 }, { "epoch": 0.491375, "grad_norm": 2.53125, "grad_norm_var": 0.008137003580729166, "learning_rate": 0.0001, "loss": 7.1327, "loss/crossentropy": 1.9189171195030212, "loss/hidden": 2.9921875, "loss/jsd": 0.0, "loss/logits": 0.20614615827798843, "step": 7862 }, { "epoch": 0.4915, "grad_norm": 2.53125, "grad_norm_var": 0.008934529622395833, "learning_rate": 0.0001, "loss": 7.4097, "loss/crossentropy": 2.064807176589966, "loss/hidden": 2.8671875, "loss/jsd": 0.0, "loss/logits": 0.21850252151489258, "step": 7864 }, { "epoch": 0.491625, "grad_norm": 2.859375, "grad_norm_var": 0.014778645833333333, "learning_rate": 0.0001, "loss": 7.4608, "loss/crossentropy": 2.271127939224243, "loss/hidden": 2.9296875, "loss/jsd": 0.0, "loss/logits": 0.22959817200899124, "step": 7866 }, { "epoch": 0.49175, "grad_norm": 2.59375, "grad_norm_var": 0.017804972330729165, "learning_rate": 0.0001, "loss": 7.5456, "loss/crossentropy": 2.293729305267334, "loss/hidden": 2.9609375, "loss/jsd": 0.0, "loss/logits": 0.21977120637893677, "step": 7868 }, { "epoch": 0.491875, "grad_norm": 2.546875, "grad_norm_var": 0.018094889322916665, "learning_rate": 0.0001, "loss": 7.6587, "loss/crossentropy": 2.5222651958465576, "loss/hidden": 2.9765625, "loss/jsd": 0.0, "loss/logits": 0.2369082123041153, "step": 7870 }, { "epoch": 0.492, "grad_norm": 2.421875, "grad_norm_var": 0.0195953369140625, "learning_rate": 0.0001, "loss": 7.3447, "loss/crossentropy": 2.252456545829773, "loss/hidden": 2.859375, "loss/jsd": 0.0, "loss/logits": 0.21486414223909378, "step": 7872 }, { "epoch": 0.492125, "grad_norm": 2.625, "grad_norm_var": 0.018550618489583334, "learning_rate": 0.0001, "loss": 7.5087, "loss/crossentropy": 2.314074754714966, "loss/hidden": 2.9140625, "loss/jsd": 0.0, "loss/logits": 0.2242046743631363, "step": 7874 }, { "epoch": 0.49225, "grad_norm": 2.46875, "grad_norm_var": 0.018431599934895834, "learning_rate": 0.0001, "loss": 7.4127, "loss/crossentropy": 2.1850411891937256, "loss/hidden": 2.9140625, "loss/jsd": 0.0, "loss/logits": 0.21031895279884338, "step": 7876 }, { "epoch": 0.492375, "grad_norm": 2.578125, "grad_norm_var": 0.017740885416666668, "learning_rate": 0.0001, "loss": 7.3183, "loss/crossentropy": 2.0519991517066956, "loss/hidden": 2.90625, "loss/jsd": 0.0, "loss/logits": 0.2178593873977661, "step": 7878 }, { "epoch": 0.4925, "grad_norm": 2.765625, "grad_norm_var": 0.020166015625, "learning_rate": 0.0001, "loss": 7.3317, "loss/crossentropy": 2.397667646408081, "loss/hidden": 2.9453125, "loss/jsd": 0.0, "loss/logits": 0.22160862386226654, "step": 7880 }, { "epoch": 0.492625, "grad_norm": 2.71875, "grad_norm_var": 0.018504842122395834, "learning_rate": 0.0001, "loss": 7.5334, "loss/crossentropy": 2.5889744758605957, "loss/hidden": 3.078125, "loss/jsd": 0.0, "loss/logits": 0.24872159957885742, "step": 7882 }, { "epoch": 0.49275, "grad_norm": 3.09375, "grad_norm_var": 0.0329742431640625, "learning_rate": 0.0001, "loss": 7.4614, "loss/crossentropy": 2.3188217878341675, "loss/hidden": 2.9609375, "loss/jsd": 0.0, "loss/logits": 0.2101772204041481, "step": 7884 }, { "epoch": 0.492875, "grad_norm": 2.578125, "grad_norm_var": 0.03137613932291667, "learning_rate": 0.0001, "loss": 7.379, "loss/crossentropy": 2.521532654762268, "loss/hidden": 3.0, "loss/jsd": 0.0, "loss/logits": 0.23337652534246445, "step": 7886 }, { "epoch": 0.493, "grad_norm": 3.046875, "grad_norm_var": 0.0341796875, "learning_rate": 0.0001, "loss": 7.7223, "loss/crossentropy": 2.261523962020874, "loss/hidden": 2.9140625, "loss/jsd": 0.0, "loss/logits": 0.22320561856031418, "step": 7888 }, { "epoch": 0.493125, "grad_norm": 2.484375, "grad_norm_var": 0.037060546875, "learning_rate": 0.0001, "loss": 7.4541, "loss/crossentropy": 2.1720380783081055, "loss/hidden": 2.8984375, "loss/jsd": 0.0, "loss/logits": 0.21075033396482468, "step": 7890 }, { "epoch": 0.49325, "grad_norm": 2.59375, "grad_norm_var": 0.03866780598958333, "learning_rate": 0.0001, "loss": 7.4677, "loss/crossentropy": 2.2282215356826782, "loss/hidden": 2.8515625, "loss/jsd": 0.0, "loss/logits": 0.20405927300453186, "step": 7892 }, { "epoch": 0.493375, "grad_norm": 2.625, "grad_norm_var": 0.03394266764322917, "learning_rate": 0.0001, "loss": 7.549, "loss/crossentropy": 2.4842289686203003, "loss/hidden": 2.9609375, "loss/jsd": 0.0, "loss/logits": 0.230421245098114, "step": 7894 }, { "epoch": 0.4935, "grad_norm": 2.453125, "grad_norm_var": 0.03765869140625, "learning_rate": 0.0001, "loss": 7.2304, "loss/crossentropy": 2.154668688774109, "loss/hidden": 2.8984375, "loss/jsd": 0.0, "loss/logits": 0.2183084860444069, "step": 7896 }, { "epoch": 0.493625, "grad_norm": 2.359375, "grad_norm_var": 0.0506011962890625, "learning_rate": 0.0001, "loss": 7.221, "loss/crossentropy": 2.2599010467529297, "loss/hidden": 2.890625, "loss/jsd": 0.0, "loss/logits": 0.2262006402015686, "step": 7898 }, { "epoch": 0.49375, "grad_norm": 2.671875, "grad_norm_var": 0.037262980143229166, "learning_rate": 0.0001, "loss": 7.3612, "loss/crossentropy": 2.190765142440796, "loss/hidden": 2.921875, "loss/jsd": 0.0, "loss/logits": 0.22431591153144836, "step": 7900 }, { "epoch": 0.493875, "grad_norm": 2.484375, "grad_norm_var": 0.035497029622395836, "learning_rate": 0.0001, "loss": 7.3968, "loss/crossentropy": 2.4013020992279053, "loss/hidden": 2.953125, "loss/jsd": 0.0, "loss/logits": 0.23306995630264282, "step": 7902 }, { "epoch": 0.494, "grad_norm": 2.515625, "grad_norm_var": 0.015282185872395833, "learning_rate": 0.0001, "loss": 7.4013, "loss/crossentropy": 2.3497613668441772, "loss/hidden": 3.0078125, "loss/jsd": 0.0, "loss/logits": 0.22705762088298798, "step": 7904 }, { "epoch": 0.494125, "grad_norm": 2.640625, "grad_norm_var": 0.012223307291666667, "learning_rate": 0.0001, "loss": 7.4019, "loss/crossentropy": 2.3468161821365356, "loss/hidden": 2.96875, "loss/jsd": 0.0, "loss/logits": 0.23190080374479294, "step": 7906 }, { "epoch": 0.49425, "grad_norm": 2.546875, "grad_norm_var": 0.017829386393229167, "learning_rate": 0.0001, "loss": 7.7085, "loss/crossentropy": 2.2013564109802246, "loss/hidden": 3.0078125, "loss/jsd": 0.0, "loss/logits": 0.22383768111467361, "step": 7908 }, { "epoch": 0.494375, "grad_norm": 2.5, "grad_norm_var": 0.0163238525390625, "learning_rate": 0.0001, "loss": 7.1006, "loss/crossentropy": 2.218170404434204, "loss/hidden": 2.8671875, "loss/jsd": 0.0, "loss/logits": 0.2111092135310173, "step": 7910 }, { "epoch": 0.4945, "grad_norm": 2.40625, "grad_norm_var": 0.016779581705729168, "learning_rate": 0.0001, "loss": 7.4947, "loss/crossentropy": 2.294690251350403, "loss/hidden": 2.984375, "loss/jsd": 0.0, "loss/logits": 0.22086331248283386, "step": 7912 }, { "epoch": 0.494625, "grad_norm": 2.375, "grad_norm_var": 0.013460286458333333, "learning_rate": 0.0001, "loss": 7.581, "loss/crossentropy": 2.3820382356643677, "loss/hidden": 2.9140625, "loss/jsd": 0.0, "loss/logits": 0.21924013644456863, "step": 7914 }, { "epoch": 0.49475, "grad_norm": 2.765625, "grad_norm_var": 0.016551717122395834, "learning_rate": 0.0001, "loss": 7.6102, "loss/crossentropy": 2.127217411994934, "loss/hidden": 3.03125, "loss/jsd": 0.0, "loss/logits": 0.2482958808541298, "step": 7916 }, { "epoch": 0.494875, "grad_norm": 2.53125, "grad_norm_var": 0.019624837239583335, "learning_rate": 0.0001, "loss": 7.3498, "loss/crossentropy": 1.9610443115234375, "loss/hidden": 2.953125, "loss/jsd": 0.0, "loss/logits": 0.21413399279117584, "step": 7918 }, { "epoch": 0.495, "grad_norm": 2.46875, "grad_norm_var": 0.020182291666666668, "learning_rate": 0.0001, "loss": 7.456, "loss/crossentropy": 2.144046127796173, "loss/hidden": 2.8671875, "loss/jsd": 0.0, "loss/logits": 0.2141159251332283, "step": 7920 }, { "epoch": 0.495125, "grad_norm": 2.84375, "grad_norm_var": 0.0251953125, "learning_rate": 0.0001, "loss": 7.3899, "loss/crossentropy": 2.013357698917389, "loss/hidden": 2.9609375, "loss/jsd": 0.0, "loss/logits": 0.20812131464481354, "step": 7922 }, { "epoch": 0.49525, "grad_norm": 2.75, "grad_norm_var": 0.0252105712890625, "learning_rate": 0.0001, "loss": 7.5517, "loss/crossentropy": 2.3825769424438477, "loss/hidden": 2.9765625, "loss/jsd": 0.0, "loss/logits": 0.23610923439264297, "step": 7924 }, { "epoch": 0.495375, "grad_norm": 2.703125, "grad_norm_var": 0.0256988525390625, "learning_rate": 0.0001, "loss": 7.6248, "loss/crossentropy": 2.3519210815429688, "loss/hidden": 2.953125, "loss/jsd": 0.0, "loss/logits": 0.231511652469635, "step": 7926 }, { "epoch": 0.4955, "grad_norm": 2.5625, "grad_norm_var": 0.02398681640625, "learning_rate": 0.0001, "loss": 7.4447, "loss/crossentropy": 2.31050181388855, "loss/hidden": 2.984375, "loss/jsd": 0.0, "loss/logits": 0.23168903589248657, "step": 7928 }, { "epoch": 0.495625, "grad_norm": 2.5625, "grad_norm_var": 0.02017822265625, "learning_rate": 0.0001, "loss": 7.2137, "loss/crossentropy": 2.1562445163726807, "loss/hidden": 2.8828125, "loss/jsd": 0.0, "loss/logits": 0.20587949454784393, "step": 7930 }, { "epoch": 0.49575, "grad_norm": 2.515625, "grad_norm_var": 0.018473307291666668, "learning_rate": 0.0001, "loss": 7.5396, "loss/crossentropy": 2.169148564338684, "loss/hidden": 2.90625, "loss/jsd": 0.0, "loss/logits": 0.22403699904680252, "step": 7932 }, { "epoch": 0.495875, "grad_norm": 2.625, "grad_norm_var": 0.012433878580729167, "learning_rate": 0.0001, "loss": 7.6196, "loss/crossentropy": 2.3977845907211304, "loss/hidden": 2.8984375, "loss/jsd": 0.0, "loss/logits": 0.23353763669729233, "step": 7934 }, { "epoch": 0.496, "grad_norm": 2.515625, "grad_norm_var": 0.013263956705729166, "learning_rate": 0.0001, "loss": 7.4943, "loss/crossentropy": 2.3904244899749756, "loss/hidden": 2.921875, "loss/jsd": 0.0, "loss/logits": 0.22647689282894135, "step": 7936 }, { "epoch": 0.496125, "grad_norm": 2.390625, "grad_norm_var": 0.01627197265625, "learning_rate": 0.0001, "loss": 7.6388, "loss/crossentropy": 2.4087865352630615, "loss/hidden": 2.953125, "loss/jsd": 0.0, "loss/logits": 0.24698343873023987, "step": 7938 }, { "epoch": 0.49625, "grad_norm": 2.359375, "grad_norm_var": 0.0163238525390625, "learning_rate": 0.0001, "loss": 7.2329, "loss/crossentropy": 2.0448268055915833, "loss/hidden": 2.890625, "loss/jsd": 0.0, "loss/logits": 0.19177532196044922, "step": 7940 }, { "epoch": 0.496375, "grad_norm": 2.5625, "grad_norm_var": 0.01666259765625, "learning_rate": 0.0001, "loss": 7.3077, "loss/crossentropy": 2.2140800952911377, "loss/hidden": 3.0078125, "loss/jsd": 0.0, "loss/logits": 0.21362647414207458, "step": 7942 }, { "epoch": 0.4965, "grad_norm": 2.484375, "grad_norm_var": 0.016357421875, "learning_rate": 0.0001, "loss": 7.3648, "loss/crossentropy": 2.161599636077881, "loss/hidden": 2.890625, "loss/jsd": 0.0, "loss/logits": 0.22595800459384918, "step": 7944 }, { "epoch": 0.496625, "grad_norm": 2.6875, "grad_norm_var": 0.017476399739583332, "learning_rate": 0.0001, "loss": 7.6153, "loss/crossentropy": 2.2285830974578857, "loss/hidden": 2.953125, "loss/jsd": 0.0, "loss/logits": 0.23164545744657516, "step": 7946 }, { "epoch": 0.49675, "grad_norm": 2.625, "grad_norm_var": 0.017292277018229166, "learning_rate": 0.0001, "loss": 7.5055, "loss/crossentropy": 2.1004785895347595, "loss/hidden": 2.8984375, "loss/jsd": 0.0, "loss/logits": 0.21466633677482605, "step": 7948 }, { "epoch": 0.496875, "grad_norm": 2.5625, "grad_norm_var": 0.020807902018229168, "learning_rate": 0.0001, "loss": 7.6433, "loss/crossentropy": 2.3662983179092407, "loss/hidden": 2.953125, "loss/jsd": 0.0, "loss/logits": 0.2292175218462944, "step": 7950 }, { "epoch": 0.497, "grad_norm": 2.453125, "grad_norm_var": 0.02379150390625, "learning_rate": 0.0001, "loss": 7.4951, "loss/crossentropy": 2.3144887685775757, "loss/hidden": 2.90625, "loss/jsd": 0.0, "loss/logits": 0.20862818509340286, "step": 7952 }, { "epoch": 0.497125, "grad_norm": 2.53125, "grad_norm_var": 0.01744384765625, "learning_rate": 0.0001, "loss": 7.3602, "loss/crossentropy": 2.235016703605652, "loss/hidden": 2.9453125, "loss/jsd": 0.0, "loss/logits": 0.224934421479702, "step": 7954 }, { "epoch": 0.49725, "grad_norm": 2.484375, "grad_norm_var": 0.014655558268229167, "learning_rate": 0.0001, "loss": 7.2794, "loss/crossentropy": 2.329625725746155, "loss/hidden": 2.921875, "loss/jsd": 0.0, "loss/logits": 0.23288895934820175, "step": 7956 }, { "epoch": 0.497375, "grad_norm": 2.875, "grad_norm_var": 0.0291168212890625, "learning_rate": 0.0001, "loss": 7.7376, "loss/crossentropy": 2.3779152631759644, "loss/hidden": 3.1015625, "loss/jsd": 0.0, "loss/logits": 0.2660157233476639, "step": 7958 }, { "epoch": 0.4975, "grad_norm": 2.609375, "grad_norm_var": 0.026781209309895835, "learning_rate": 0.0001, "loss": 7.5011, "loss/crossentropy": 2.289755702018738, "loss/hidden": 3.0546875, "loss/jsd": 0.0, "loss/logits": 0.2581496238708496, "step": 7960 }, { "epoch": 0.497625, "grad_norm": 2.515625, "grad_norm_var": 0.028987630208333334, "learning_rate": 0.0001, "loss": 7.487, "loss/crossentropy": 2.343310594558716, "loss/hidden": 2.9921875, "loss/jsd": 0.0, "loss/logits": 0.21825644373893738, "step": 7962 }, { "epoch": 0.49775, "grad_norm": 2.515625, "grad_norm_var": 0.0320953369140625, "learning_rate": 0.0001, "loss": 7.551, "loss/crossentropy": 2.373992443084717, "loss/hidden": 3.0078125, "loss/jsd": 0.0, "loss/logits": 0.2342175841331482, "step": 7964 }, { "epoch": 0.497875, "grad_norm": 2.53125, "grad_norm_var": 0.0314605712890625, "learning_rate": 0.0001, "loss": 7.4193, "loss/crossentropy": 2.3504817485809326, "loss/hidden": 2.9921875, "loss/jsd": 0.0, "loss/logits": 0.21713367849588394, "step": 7966 }, { "epoch": 0.498, "grad_norm": 2.46875, "grad_norm_var": 0.029850260416666666, "learning_rate": 0.0001, "loss": 7.4289, "loss/crossentropy": 2.3387320041656494, "loss/hidden": 2.921875, "loss/jsd": 0.0, "loss/logits": 0.22403793036937714, "step": 7968 }, { "epoch": 0.498125, "grad_norm": 2.484375, "grad_norm_var": 0.030322265625, "learning_rate": 0.0001, "loss": 7.3587, "loss/crossentropy": 2.085926115512848, "loss/hidden": 2.890625, "loss/jsd": 0.0, "loss/logits": 0.22341713309288025, "step": 7970 }, { "epoch": 0.49825, "grad_norm": 2.59375, "grad_norm_var": 0.0310943603515625, "learning_rate": 0.0001, "loss": 7.3515, "loss/crossentropy": 1.8860748410224915, "loss/hidden": 3.0, "loss/jsd": 0.0, "loss/logits": 0.21420737355947495, "step": 7972 }, { "epoch": 0.498375, "grad_norm": 2.484375, "grad_norm_var": 0.00982666015625, "learning_rate": 0.0001, "loss": 7.2707, "loss/crossentropy": 2.4079747200012207, "loss/hidden": 2.921875, "loss/jsd": 0.0, "loss/logits": 0.23120195418596268, "step": 7974 }, { "epoch": 0.4985, "grad_norm": 2.5, "grad_norm_var": 0.009370930989583333, "learning_rate": 0.0001, "loss": 7.3572, "loss/crossentropy": 1.9960143566131592, "loss/hidden": 2.890625, "loss/jsd": 0.0, "loss/logits": 0.23607300221920013, "step": 7976 }, { "epoch": 0.498625, "grad_norm": 2.734375, "grad_norm_var": 0.007933553059895833, "learning_rate": 0.0001, "loss": 7.5196, "loss/crossentropy": 2.304525852203369, "loss/hidden": 3.0234375, "loss/jsd": 0.0, "loss/logits": 0.24546398222446442, "step": 7978 }, { "epoch": 0.49875, "grad_norm": 2.359375, "grad_norm_var": 0.009601847330729166, "learning_rate": 0.0001, "loss": 7.1713, "loss/crossentropy": 2.1086556911468506, "loss/hidden": 2.890625, "loss/jsd": 0.0, "loss/logits": 0.20619191229343414, "step": 7980 }, { "epoch": 0.498875, "grad_norm": 2.640625, "grad_norm_var": 0.010960896809895834, "learning_rate": 0.0001, "loss": 7.5151, "loss/crossentropy": 2.2201133966445923, "loss/hidden": 2.890625, "loss/jsd": 0.0, "loss/logits": 0.2153434380888939, "step": 7982 }, { "epoch": 0.499, "grad_norm": 2.578125, "grad_norm_var": 0.011205037434895834, "learning_rate": 0.0001, "loss": 7.3042, "loss/crossentropy": 2.0850881338119507, "loss/hidden": 2.96875, "loss/jsd": 0.0, "loss/logits": 0.21027522534132004, "step": 7984 }, { "epoch": 0.499125, "grad_norm": 2.6875, "grad_norm_var": 0.012678019205729167, "learning_rate": 0.0001, "loss": 7.4745, "loss/crossentropy": 2.120269536972046, "loss/hidden": 2.921875, "loss/jsd": 0.0, "loss/logits": 0.22233043611049652, "step": 7986 }, { "epoch": 0.49925, "grad_norm": 2.453125, "grad_norm_var": 0.011961873372395833, "learning_rate": 0.0001, "loss": 7.2802, "loss/crossentropy": 2.251330018043518, "loss/hidden": 2.9296875, "loss/jsd": 0.0, "loss/logits": 0.21758969128131866, "step": 7988 }, { "epoch": 0.499375, "grad_norm": 2.421875, "grad_norm_var": 0.012984212239583333, "learning_rate": 0.0001, "loss": 7.5933, "loss/crossentropy": 2.318128824234009, "loss/hidden": 2.90625, "loss/jsd": 0.0, "loss/logits": 0.23310749232769012, "step": 7990 }, { "epoch": 0.4995, "grad_norm": 2.46875, "grad_norm_var": 0.01328125, "learning_rate": 0.0001, "loss": 7.3615, "loss/crossentropy": 2.1891262531280518, "loss/hidden": 2.875, "loss/jsd": 0.0, "loss/logits": 0.2182084545493126, "step": 7992 }, { "epoch": 0.499625, "grad_norm": 2.5625, "grad_norm_var": 0.010114542643229167, "learning_rate": 0.0001, "loss": 7.4859, "loss/crossentropy": 2.1905835270881653, "loss/hidden": 2.9296875, "loss/jsd": 0.0, "loss/logits": 0.22370078414678574, "step": 7994 }, { "epoch": 0.49975, "grad_norm": 2.5625, "grad_norm_var": 0.008089192708333333, "learning_rate": 0.0001, "loss": 7.7101, "loss/crossentropy": 2.372948169708252, "loss/hidden": 3.0078125, "loss/jsd": 0.0, "loss/logits": 0.2249649539589882, "step": 7996 }, { "epoch": 0.499875, "grad_norm": 2.390625, "grad_norm_var": 0.009422810872395833, "learning_rate": 0.0001, "loss": 7.3266, "loss/crossentropy": 2.166934847831726, "loss/hidden": 2.859375, "loss/jsd": 0.0, "loss/logits": 0.22332046926021576, "step": 7998 }, { "epoch": 0.5, "grad_norm": 2.5625, "grad_norm_var": 0.009749348958333333, "learning_rate": 0.0001, "loss": 7.4868, "loss/crossentropy": 2.2428027391433716, "loss/hidden": 2.9140625, "loss/jsd": 0.0, "loss/logits": 0.21538162976503372, "step": 8000 } ], "logging_steps": 2, "max_steps": 16000, "num_input_tokens_seen": 0, "num_train_epochs": 9223372036854775807, "save_steps": 4000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 8.33181242621952e+18, "train_batch_size": 8, "trial_name": null, "trial_params": null }