| { |
| "best_global_step": null, |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 0.2, |
| "eval_steps": 2000, |
| "global_step": 2000, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.001, |
| "grad_norm": 10752.0, |
| "learning_rate": 1.9e-05, |
| "loss": 158.0638, |
| "loss/crossentropy": 14.456178283691406, |
| "loss/hidden": 18.91875, |
| "loss/jsd": 0.0, |
| "loss/logits": 12.539741969108581, |
| "step": 10 |
| }, |
| { |
| "epoch": 0.002, |
| "grad_norm": 3264.0, |
| "grad_norm_var": 13568954.666666666, |
| "learning_rate": 2.8000000000000003e-05, |
| "loss": 129.8883, |
| "loss/crossentropy": 11.943150734901428, |
| "loss/hidden": 19.128125, |
| "loss/jsd": 0.0, |
| "loss/logits": 10.032073307037354, |
| "step": 20 |
| }, |
| { |
| "epoch": 0.003, |
| "grad_norm": 1824.0, |
| "grad_norm_var": 3372859.7333333334, |
| "learning_rate": 3.7e-05, |
| "loss": 100.0245, |
| "loss/crossentropy": 9.159896969795227, |
| "loss/hidden": 18.609375, |
| "loss/jsd": 0.0, |
| "loss/logits": 7.277156031131744, |
| "step": 30 |
| }, |
| { |
| "epoch": 0.004, |
| "grad_norm": 604.0, |
| "grad_norm_var": 331110.3333333333, |
| "learning_rate": 4.600000000000001e-05, |
| "loss": 90.5579, |
| "loss/crossentropy": 8.28059525489807, |
| "loss/hidden": 18.39375, |
| "loss/jsd": 0.0, |
| "loss/logits": 6.247069478034973, |
| "step": 40 |
| }, |
| { |
| "epoch": 0.005, |
| "grad_norm": 1128.0, |
| "grad_norm_var": 60515.2, |
| "learning_rate": 5.500000000000001e-05, |
| "loss": 86.1966, |
| "loss/crossentropy": 8.01256047487259, |
| "loss/hidden": 18.175, |
| "loss/jsd": 0.0, |
| "loss/logits": 6.1038679599761965, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.006, |
| "grad_norm": 1360.0, |
| "grad_norm_var": 67713.86666666667, |
| "learning_rate": 6.400000000000001e-05, |
| "loss": 82.9348, |
| "loss/crossentropy": 7.731317961215973, |
| "loss/hidden": 17.959375, |
| "loss/jsd": 0.0, |
| "loss/logits": 5.726186037063599, |
| "step": 60 |
| }, |
| { |
| "epoch": 0.007, |
| "grad_norm": 1016.0, |
| "grad_norm_var": 35902.933333333334, |
| "learning_rate": 7.3e-05, |
| "loss": 78.6625, |
| "loss/crossentropy": 7.318132603168488, |
| "loss/hidden": 17.8375, |
| "loss/jsd": 0.0, |
| "loss/logits": 5.322234338521957, |
| "step": 70 |
| }, |
| { |
| "epoch": 0.008, |
| "grad_norm": 836.0, |
| "grad_norm_var": 12856.466666666667, |
| "learning_rate": 8.200000000000001e-05, |
| "loss": 74.6, |
| "loss/crossentropy": 6.863537752628327, |
| "loss/hidden": 17.325, |
| "loss/jsd": 0.0, |
| "loss/logits": 4.851147556304932, |
| "step": 80 |
| }, |
| { |
| "epoch": 0.009, |
| "grad_norm": 1168.0, |
| "grad_norm_var": 38569.0, |
| "learning_rate": 9.1e-05, |
| "loss": 69.2648, |
| "loss/crossentropy": 6.536011290550232, |
| "loss/hidden": 16.871875, |
| "loss/jsd": 0.0, |
| "loss/logits": 4.729572284221649, |
| "step": 90 |
| }, |
| { |
| "epoch": 0.01, |
| "grad_norm": 956.0, |
| "grad_norm_var": 54132.26666666667, |
| "learning_rate": 0.0001, |
| "loss": 61.5492, |
| "loss/crossentropy": 5.978731215000153, |
| "loss/hidden": 15.9046875, |
| "loss/jsd": 0.0, |
| "loss/logits": 4.037681633234024, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.011, |
| "grad_norm": 494.0, |
| "grad_norm_var": 60329.066666666666, |
| "learning_rate": 0.0001, |
| "loss": 50.5696, |
| "loss/crossentropy": 5.069290089607239, |
| "loss/hidden": 13.9625, |
| "loss/jsd": 0.0, |
| "loss/logits": 3.04628010392189, |
| "step": 110 |
| }, |
| { |
| "epoch": 0.012, |
| "grad_norm": 242.0, |
| "grad_norm_var": 33342.59583333333, |
| "learning_rate": 0.0001, |
| "loss": 38.8513, |
| "loss/crossentropy": 4.116593188047409, |
| "loss/hidden": 12.21875, |
| "loss/jsd": 0.0, |
| "loss/logits": 2.207608225941658, |
| "step": 120 |
| }, |
| { |
| "epoch": 0.013, |
| "grad_norm": 189.0, |
| "grad_norm_var": 2268.9625, |
| "learning_rate": 0.0001, |
| "loss": 30.2934, |
| "loss/crossentropy": 3.6065172433853148, |
| "loss/hidden": 10.4703125, |
| "loss/jsd": 0.0, |
| "loss/logits": 1.553831559419632, |
| "step": 130 |
| }, |
| { |
| "epoch": 0.014, |
| "grad_norm": 129.0, |
| "grad_norm_var": 428.78333333333336, |
| "learning_rate": 0.0001, |
| "loss": 25.4075, |
| "loss/crossentropy": 3.238997083902359, |
| "loss/hidden": 9.36875, |
| "loss/jsd": 0.0, |
| "loss/logits": 1.2455815717577934, |
| "step": 140 |
| }, |
| { |
| "epoch": 0.015, |
| "grad_norm": 147.0, |
| "grad_norm_var": 884.8666666666667, |
| "learning_rate": 0.0001, |
| "loss": 21.889, |
| "loss/crossentropy": 3.104075390100479, |
| "loss/hidden": 8.19296875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.981781056523323, |
| "step": 150 |
| }, |
| { |
| "epoch": 0.016, |
| "grad_norm": 242.0, |
| "grad_norm_var": 1127.890625, |
| "learning_rate": 0.0001, |
| "loss": 19.3636, |
| "loss/crossentropy": 2.6487351998686792, |
| "loss/hidden": 7.96328125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.862408060580492, |
| "step": 160 |
| }, |
| { |
| "epoch": 0.017, |
| "grad_norm": 139.0, |
| "grad_norm_var": 1720.690625, |
| "learning_rate": 0.0001, |
| "loss": 17.9103, |
| "loss/crossentropy": 2.944036450982094, |
| "loss/hidden": 7.28671875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.7954695858061314, |
| "step": 170 |
| }, |
| { |
| "epoch": 0.018, |
| "grad_norm": 127.0, |
| "grad_norm_var": 1522.8958333333333, |
| "learning_rate": 0.0001, |
| "loss": 17.1787, |
| "loss/crossentropy": 2.7259451180696486, |
| "loss/hidden": 7.03046875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.7603268466889859, |
| "step": 180 |
| }, |
| { |
| "epoch": 0.019, |
| "grad_norm": 155.0, |
| "grad_norm_var": 1390.2666666666667, |
| "learning_rate": 0.0001, |
| "loss": 16.3546, |
| "loss/crossentropy": 2.745239295065403, |
| "loss/hidden": 6.74765625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.6926519803702831, |
| "step": 190 |
| }, |
| { |
| "epoch": 0.02, |
| "grad_norm": 164.0, |
| "grad_norm_var": 902.2666666666667, |
| "learning_rate": 0.0001, |
| "loss": 15.7972, |
| "loss/crossentropy": 2.6587735950946807, |
| "loss/hidden": 6.6234375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.6642795346677304, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.021, |
| "grad_norm": 173.0, |
| "grad_norm_var": 1056.5166666666667, |
| "learning_rate": 0.0001, |
| "loss": 15.4154, |
| "loss/crossentropy": 2.67086471170187, |
| "loss/hidden": 6.30390625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.6120679222047329, |
| "step": 210 |
| }, |
| { |
| "epoch": 0.022, |
| "grad_norm": 168.0, |
| "grad_norm_var": 446.2291666666667, |
| "learning_rate": 0.0001, |
| "loss": 14.9164, |
| "loss/crossentropy": 2.8284773945808412, |
| "loss/hidden": 6.15546875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.6234366297721863, |
| "step": 220 |
| }, |
| { |
| "epoch": 0.023, |
| "grad_norm": 187.0, |
| "grad_norm_var": 7334.5625, |
| "learning_rate": 0.0001, |
| "loss": 14.9531, |
| "loss/crossentropy": 2.716707041859627, |
| "loss/hidden": 6.196875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.6206937313079834, |
| "step": 230 |
| }, |
| { |
| "epoch": 0.024, |
| "grad_norm": 172.0, |
| "grad_norm_var": 6329.6625, |
| "learning_rate": 0.0001, |
| "loss": 14.4769, |
| "loss/crossentropy": 2.4854482382535936, |
| "loss/hidden": 6.05859375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.5394440380856395, |
| "step": 240 |
| }, |
| { |
| "epoch": 0.025, |
| "grad_norm": 1149239296.0, |
| "grad_norm_var": 8.254690576187568e+16, |
| "learning_rate": 0.0001, |
| "loss": 14.4045, |
| "loss/crossentropy": 2.717127138376236, |
| "loss/hidden": 5.9890625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.5883205510675907, |
| "step": 250 |
| }, |
| { |
| "epoch": 0.026, |
| "grad_norm": 149.0, |
| "grad_norm_var": 8.254691009067347e+16, |
| "learning_rate": 0.0001, |
| "loss": 13.9101, |
| "loss/crossentropy": 2.478851719200611, |
| "loss/hidden": 5.8765625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.5184394292533397, |
| "step": 260 |
| }, |
| { |
| "epoch": 0.027, |
| "grad_norm": 186.0, |
| "grad_norm_var": 930.0625, |
| "learning_rate": 0.0001, |
| "loss": 13.6027, |
| "loss/crossentropy": 2.553143638372421, |
| "loss/hidden": 5.75234375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.5368255846202373, |
| "step": 270 |
| }, |
| { |
| "epoch": 0.028, |
| "grad_norm": 172.0, |
| "grad_norm_var": 3805.0666666666666, |
| "learning_rate": 0.0001, |
| "loss": 13.9542, |
| "loss/crossentropy": 2.7657821238040925, |
| "loss/hidden": 5.8421875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.5587639883160591, |
| "step": 280 |
| }, |
| { |
| "epoch": 0.029, |
| "grad_norm": 135.0, |
| "grad_norm_var": 3837.616666666667, |
| "learning_rate": 0.0001, |
| "loss": 13.3979, |
| "loss/crossentropy": 2.4579825714230537, |
| "loss/hidden": 5.62421875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.5003356814384461, |
| "step": 290 |
| }, |
| { |
| "epoch": 0.03, |
| "grad_norm": 119.5, |
| "grad_norm_var": 4336.095833333334, |
| "learning_rate": 0.0001, |
| "loss": 13.372, |
| "loss/crossentropy": 2.4825384080410005, |
| "loss/hidden": 5.77734375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.5297574065625668, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.031, |
| "grad_norm": 144.0, |
| "grad_norm_var": 2114.4333333333334, |
| "learning_rate": 0.0001, |
| "loss": 13.2199, |
| "loss/crossentropy": 2.6365180641412733, |
| "loss/hidden": 5.5484375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.5377178646624088, |
| "step": 310 |
| }, |
| { |
| "epoch": 0.032, |
| "grad_norm": 126.5, |
| "grad_norm_var": 885.75, |
| "learning_rate": 0.0001, |
| "loss": 13.022, |
| "loss/crossentropy": 2.41667592599988, |
| "loss/hidden": 5.5296875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.4934091318398714, |
| "step": 320 |
| }, |
| { |
| "epoch": 0.033, |
| "grad_norm": 113.0, |
| "grad_norm_var": 4216.623958333334, |
| "learning_rate": 0.0001, |
| "loss": 12.6825, |
| "loss/crossentropy": 2.6186458706855773, |
| "loss/hidden": 5.3796875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.48778619766235354, |
| "step": 330 |
| }, |
| { |
| "epoch": 0.034, |
| "grad_norm": 154.0, |
| "grad_norm_var": 637.090625, |
| "learning_rate": 0.0001, |
| "loss": 12.6415, |
| "loss/crossentropy": 2.6686057686805724, |
| "loss/hidden": 5.384375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.4940062865614891, |
| "step": 340 |
| }, |
| { |
| "epoch": 0.035, |
| "grad_norm": 144.0, |
| "grad_norm_var": 2633.765625, |
| "learning_rate": 0.0001, |
| "loss": 12.6064, |
| "loss/crossentropy": 2.52793410718441, |
| "loss/hidden": 5.21171875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.45680325478315353, |
| "step": 350 |
| }, |
| { |
| "epoch": 0.036, |
| "grad_norm": 141.0, |
| "grad_norm_var": 2513.148958333333, |
| "learning_rate": 0.0001, |
| "loss": 12.508, |
| "loss/crossentropy": 2.445630243420601, |
| "loss/hidden": 5.31171875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.4673466898500919, |
| "step": 360 |
| }, |
| { |
| "epoch": 0.037, |
| "grad_norm": 146.0, |
| "grad_norm_var": 161.95729166666666, |
| "learning_rate": 0.0001, |
| "loss": 12.3383, |
| "loss/crossentropy": 2.432392257452011, |
| "loss/hidden": 5.2109375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.4600852273404598, |
| "step": 370 |
| }, |
| { |
| "epoch": 0.038, |
| "grad_norm": 122.5, |
| "grad_norm_var": 1555.340625, |
| "learning_rate": 0.0001, |
| "loss": 12.2486, |
| "loss/crossentropy": 2.448658475279808, |
| "loss/hidden": 5.29765625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.47797103337943553, |
| "step": 380 |
| }, |
| { |
| "epoch": 0.039, |
| "grad_norm": 110.5, |
| "grad_norm_var": 159.92916666666667, |
| "learning_rate": 0.0001, |
| "loss": 11.9006, |
| "loss/crossentropy": 2.4291503965854644, |
| "loss/hidden": 5.01328125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.43006020598113537, |
| "step": 390 |
| }, |
| { |
| "epoch": 0.04, |
| "grad_norm": 136.0, |
| "grad_norm_var": 175.37395833333332, |
| "learning_rate": 0.0001, |
| "loss": 11.9938, |
| "loss/crossentropy": 2.604290932416916, |
| "loss/hidden": 4.9828125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.4612982179969549, |
| "step": 400 |
| }, |
| { |
| "epoch": 0.041, |
| "grad_norm": 109.0, |
| "grad_norm_var": 170.09583333333333, |
| "learning_rate": 0.0001, |
| "loss": 11.8251, |
| "loss/crossentropy": 2.3994911506772043, |
| "loss/hidden": 5.03984375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.4143600896000862, |
| "step": 410 |
| }, |
| { |
| "epoch": 0.042, |
| "grad_norm": 122.0, |
| "grad_norm_var": 150.45729166666666, |
| "learning_rate": 0.0001, |
| "loss": 11.6797, |
| "loss/crossentropy": 2.428033410012722, |
| "loss/hidden": 4.96171875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.41778192222118377, |
| "step": 420 |
| }, |
| { |
| "epoch": 0.043, |
| "grad_norm": 119.0, |
| "grad_norm_var": 125.55729166666667, |
| "learning_rate": 0.0001, |
| "loss": 11.7055, |
| "loss/crossentropy": 2.569334480166435, |
| "loss/hidden": 4.9921875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.4176106728613377, |
| "step": 430 |
| }, |
| { |
| "epoch": 0.044, |
| "grad_norm": 120.0, |
| "grad_norm_var": 186.67395833333333, |
| "learning_rate": 0.0001, |
| "loss": 11.5608, |
| "loss/crossentropy": 2.5353519901633264, |
| "loss/hidden": 4.82578125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.4004150029271841, |
| "step": 440 |
| }, |
| { |
| "epoch": 0.045, |
| "grad_norm": 111.5, |
| "grad_norm_var": 157.52395833333333, |
| "learning_rate": 0.0001, |
| "loss": 11.6926, |
| "loss/crossentropy": 2.539342051744461, |
| "loss/hidden": 4.9390625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.4505396105349064, |
| "step": 450 |
| }, |
| { |
| "epoch": 0.046, |
| "grad_norm": 126.0, |
| "grad_norm_var": 329.32916666666665, |
| "learning_rate": 0.0001, |
| "loss": 11.3179, |
| "loss/crossentropy": 2.4947912380099297, |
| "loss/hidden": 4.70703125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.39311613626778125, |
| "step": 460 |
| }, |
| { |
| "epoch": 0.047, |
| "grad_norm": 130.0, |
| "grad_norm_var": 482.1958333333333, |
| "learning_rate": 0.0001, |
| "loss": 11.2995, |
| "loss/crossentropy": 2.522867926955223, |
| "loss/hidden": 4.778125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.39878650680184363, |
| "step": 470 |
| }, |
| { |
| "epoch": 0.048, |
| "grad_norm": 114.5, |
| "grad_norm_var": 159.2, |
| "learning_rate": 0.0001, |
| "loss": 11.1298, |
| "loss/crossentropy": 2.503119890391827, |
| "loss/hidden": 4.6859375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.4145892545580864, |
| "step": 480 |
| }, |
| { |
| "epoch": 0.049, |
| "grad_norm": 123.5, |
| "grad_norm_var": 2113.4625, |
| "learning_rate": 0.0001, |
| "loss": 11.0383, |
| "loss/crossentropy": 2.4039885073900225, |
| "loss/hidden": 4.658203125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.37959295585751535, |
| "step": 490 |
| }, |
| { |
| "epoch": 0.05, |
| "grad_norm": 110.0, |
| "grad_norm_var": 1545.5291666666667, |
| "learning_rate": 0.0001, |
| "loss": 10.9564, |
| "loss/crossentropy": 2.3160028889775277, |
| "loss/hidden": 4.78359375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.4041217315942049, |
| "step": 500 |
| }, |
| { |
| "epoch": 0.051, |
| "grad_norm": 118.5, |
| "grad_norm_var": 1485.1572916666667, |
| "learning_rate": 0.0001, |
| "loss": 11.0273, |
| "loss/crossentropy": 2.3481629095971583, |
| "loss/hidden": 4.76796875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.388704277202487, |
| "step": 510 |
| }, |
| { |
| "epoch": 0.052, |
| "grad_norm": 302.0, |
| "grad_norm_var": 4060.695833333333, |
| "learning_rate": 0.0001, |
| "loss": 10.8826, |
| "loss/crossentropy": 2.432570169866085, |
| "loss/hidden": 4.647265625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.4005543690174818, |
| "step": 520 |
| }, |
| { |
| "epoch": 0.053, |
| "grad_norm": 262.0, |
| "grad_norm_var": 5144.929166666667, |
| "learning_rate": 0.0001, |
| "loss": 10.9255, |
| "loss/crossentropy": 2.4078257739543916, |
| "loss/hidden": 4.51953125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.3619723778218031, |
| "step": 530 |
| }, |
| { |
| "epoch": 0.054, |
| "grad_norm": 111.5, |
| "grad_norm_var": 3058.195833333333, |
| "learning_rate": 0.0001, |
| "loss": 10.8513, |
| "loss/crossentropy": 2.1905623614788055, |
| "loss/hidden": 4.54921875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.3489991918206215, |
| "step": 540 |
| }, |
| { |
| "epoch": 0.055, |
| "grad_norm": 98.0, |
| "grad_norm_var": 2313.990625, |
| "learning_rate": 0.0001, |
| "loss": 10.8386, |
| "loss/crossentropy": 2.4719990983605387, |
| "loss/hidden": 4.63984375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.4116944268345833, |
| "step": 550 |
| }, |
| { |
| "epoch": 0.056, |
| "grad_norm": 105.0, |
| "grad_norm_var": 1808.315625, |
| "learning_rate": 0.0001, |
| "loss": 10.7797, |
| "loss/crossentropy": 2.381363682448864, |
| "loss/hidden": 4.58828125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.38398357704281805, |
| "step": 560 |
| }, |
| { |
| "epoch": 0.057, |
| "grad_norm": 206.0, |
| "grad_norm_var": 1395.3, |
| "learning_rate": 0.0001, |
| "loss": 10.6643, |
| "loss/crossentropy": 2.531977267563343, |
| "loss/hidden": 4.6171875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.37524734511971475, |
| "step": 570 |
| }, |
| { |
| "epoch": 0.058, |
| "grad_norm": 150.0, |
| "grad_norm_var": 1246.6333333333334, |
| "learning_rate": 0.0001, |
| "loss": 10.5081, |
| "loss/crossentropy": 2.391422814875841, |
| "loss/hidden": 4.46484375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.3587542846798897, |
| "step": 580 |
| }, |
| { |
| "epoch": 0.059, |
| "grad_norm": 110.5, |
| "grad_norm_var": 678.5333333333333, |
| "learning_rate": 0.0001, |
| "loss": 10.4338, |
| "loss/crossentropy": 2.267008524388075, |
| "loss/hidden": 4.356640625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.3174692545086145, |
| "step": 590 |
| }, |
| { |
| "epoch": 0.06, |
| "grad_norm": 135.0, |
| "grad_norm_var": 914.0489583333333, |
| "learning_rate": 0.0001, |
| "loss": 10.5236, |
| "loss/crossentropy": 2.3517861902713775, |
| "loss/hidden": 4.42265625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.3542962525039911, |
| "step": 600 |
| }, |
| { |
| "epoch": 0.061, |
| "grad_norm": 103.0, |
| "grad_norm_var": 904.1989583333333, |
| "learning_rate": 0.0001, |
| "loss": 10.345, |
| "loss/crossentropy": 2.3741147622466086, |
| "loss/hidden": 4.4171875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.3606201378628612, |
| "step": 610 |
| }, |
| { |
| "epoch": 0.062, |
| "grad_norm": 86.0, |
| "grad_norm_var": 624.25, |
| "learning_rate": 0.0001, |
| "loss": 10.4494, |
| "loss/crossentropy": 2.3786921083927153, |
| "loss/hidden": 4.291796875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.33345147483050824, |
| "step": 620 |
| }, |
| { |
| "epoch": 0.063, |
| "grad_norm": 109.0, |
| "grad_norm_var": 580.0333333333333, |
| "learning_rate": 0.0001, |
| "loss": 10.1494, |
| "loss/crossentropy": 2.3835427895188332, |
| "loss/hidden": 4.328515625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.33732542097568513, |
| "step": 630 |
| }, |
| { |
| "epoch": 0.064, |
| "grad_norm": 106.5, |
| "grad_norm_var": 407.5625, |
| "learning_rate": 0.0001, |
| "loss": 10.334, |
| "loss/crossentropy": 2.3970961540937425, |
| "loss/hidden": 4.486328125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.3739761531352997, |
| "step": 640 |
| }, |
| { |
| "epoch": 0.065, |
| "grad_norm": 127.0, |
| "grad_norm_var": 8.827054751968406e+17, |
| "learning_rate": 0.0001, |
| "loss": 10.3909, |
| "loss/crossentropy": 2.603018820285797, |
| "loss/hidden": 4.336328125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.35302893407642844, |
| "step": 650 |
| }, |
| { |
| "epoch": 0.066, |
| "grad_norm": 103.0, |
| "grad_norm_var": 8.827054748993245e+17, |
| "learning_rate": 0.0001, |
| "loss": 10.3448, |
| "loss/crossentropy": 2.209125077724457, |
| "loss/hidden": 4.298046875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.3420632269233465, |
| "step": 660 |
| }, |
| { |
| "epoch": 0.067, |
| "grad_norm": 111.5, |
| "grad_norm_var": 190.75, |
| "learning_rate": 0.0001, |
| "loss": 10.1599, |
| "loss/crossentropy": 2.1904555816203355, |
| "loss/hidden": 4.413671875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.3357353564351797, |
| "step": 670 |
| }, |
| { |
| "epoch": 0.068, |
| "grad_norm": 88.5, |
| "grad_norm_var": 215.29895833333333, |
| "learning_rate": 0.0001, |
| "loss": 9.9371, |
| "loss/crossentropy": 2.3618984460830688, |
| "loss/hidden": 4.186328125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.33678749240934847, |
| "step": 680 |
| }, |
| { |
| "epoch": 0.069, |
| "grad_norm": 95.0, |
| "grad_norm_var": 228.140625, |
| "learning_rate": 0.0001, |
| "loss": 10.0861, |
| "loss/crossentropy": 2.372377243638039, |
| "loss/hidden": 4.2109375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.3243491280823946, |
| "step": 690 |
| }, |
| { |
| "epoch": 0.07, |
| "grad_norm": 84.0, |
| "grad_norm_var": 520.1666666666666, |
| "learning_rate": 0.0001, |
| "loss": 10.2116, |
| "loss/crossentropy": 2.235209721326828, |
| "loss/hidden": 4.303515625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.34188132397830484, |
| "step": 700 |
| }, |
| { |
| "epoch": 0.071, |
| "grad_norm": 103.0, |
| "grad_norm_var": 553.9291666666667, |
| "learning_rate": 0.0001, |
| "loss": 9.9575, |
| "loss/crossentropy": 2.3372152552008627, |
| "loss/hidden": 4.1390625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.3105729196220636, |
| "step": 710 |
| }, |
| { |
| "epoch": 0.072, |
| "grad_norm": 107.0, |
| "grad_norm_var": 538.540625, |
| "learning_rate": 0.0001, |
| "loss": 9.978, |
| "loss/crossentropy": 2.510573136806488, |
| "loss/hidden": 4.14453125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.3471809647977352, |
| "step": 720 |
| }, |
| { |
| "epoch": 0.073, |
| "grad_norm": 139.0, |
| "grad_norm_var": 493.49583333333334, |
| "learning_rate": 0.0001, |
| "loss": 9.9677, |
| "loss/crossentropy": 2.3755437433719635, |
| "loss/hidden": 4.123828125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.3338810380548239, |
| "step": 730 |
| }, |
| { |
| "epoch": 0.074, |
| "grad_norm": 99.0, |
| "grad_norm_var": 286.8625, |
| "learning_rate": 0.0001, |
| "loss": 9.8714, |
| "loss/crossentropy": 2.3226330026984217, |
| "loss/hidden": 4.11015625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.32580162063241, |
| "step": 740 |
| }, |
| { |
| "epoch": 0.075, |
| "grad_norm": 85.5, |
| "grad_norm_var": 425.8625, |
| "learning_rate": 0.0001, |
| "loss": 9.7891, |
| "loss/crossentropy": 2.3768628584221005, |
| "loss/hidden": 4.124609375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.31062583327293397, |
| "step": 750 |
| }, |
| { |
| "epoch": 0.076, |
| "grad_norm": 120.0, |
| "grad_norm_var": 373.765625, |
| "learning_rate": 0.0001, |
| "loss": 9.8455, |
| "loss/crossentropy": 2.4248126417398455, |
| "loss/hidden": 4.2359375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.3379279874265194, |
| "step": 760 |
| }, |
| { |
| "epoch": 0.077, |
| "grad_norm": 115.5, |
| "grad_norm_var": 366.765625, |
| "learning_rate": 0.0001, |
| "loss": 9.7894, |
| "loss/crossentropy": 2.2128719061613085, |
| "loss/hidden": 4.18515625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.3335044614970684, |
| "step": 770 |
| }, |
| { |
| "epoch": 0.078, |
| "grad_norm": 82.0, |
| "grad_norm_var": 207.05, |
| "learning_rate": 0.0001, |
| "loss": 9.7323, |
| "loss/crossentropy": 2.321111184358597, |
| "loss/hidden": 4.112109375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.30921670254319905, |
| "step": 780 |
| }, |
| { |
| "epoch": 0.079, |
| "grad_norm": 90.0, |
| "grad_norm_var": 321.65729166666665, |
| "learning_rate": 0.0001, |
| "loss": 9.7419, |
| "loss/crossentropy": 2.3887290723621843, |
| "loss/hidden": 4.17421875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.34963752441108226, |
| "step": 790 |
| }, |
| { |
| "epoch": 0.08, |
| "grad_norm": 90.0, |
| "grad_norm_var": 1653.9958333333334, |
| "learning_rate": 0.0001, |
| "loss": 9.6443, |
| "loss/crossentropy": 2.34355805516243, |
| "loss/hidden": 4.119140625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.3216205321252346, |
| "step": 800 |
| }, |
| { |
| "epoch": 0.081, |
| "grad_norm": 111.0, |
| "grad_norm_var": 1760.865625, |
| "learning_rate": 0.0001, |
| "loss": 9.7151, |
| "loss/crossentropy": 2.26568204164505, |
| "loss/hidden": 4.0734375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.3119744971394539, |
| "step": 810 |
| }, |
| { |
| "epoch": 0.082, |
| "grad_norm": 100.0, |
| "grad_norm_var": 365.0, |
| "learning_rate": 0.0001, |
| "loss": 9.6335, |
| "loss/crossentropy": 2.363439542800188, |
| "loss/hidden": 4.0421875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.3196489207446575, |
| "step": 820 |
| }, |
| { |
| "epoch": 0.083, |
| "grad_norm": 105.0, |
| "grad_norm_var": 725.840625, |
| "learning_rate": 0.0001, |
| "loss": 9.5683, |
| "loss/crossentropy": 2.25376470759511, |
| "loss/hidden": 4.040625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.3137321826070547, |
| "step": 830 |
| }, |
| { |
| "epoch": 0.084, |
| "grad_norm": 91.0, |
| "grad_norm_var": 243.115625, |
| "learning_rate": 0.0001, |
| "loss": 9.6059, |
| "loss/crossentropy": 2.402809253334999, |
| "loss/hidden": 4.08359375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.3079391553997993, |
| "step": 840 |
| }, |
| { |
| "epoch": 0.085, |
| "grad_norm": 115.5, |
| "grad_norm_var": 52.3625, |
| "learning_rate": 0.0001, |
| "loss": 9.4809, |
| "loss/crossentropy": 2.3521162420511246, |
| "loss/hidden": 3.929296875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.3063440557569265, |
| "step": 850 |
| }, |
| { |
| "epoch": 0.086, |
| "grad_norm": 91.0, |
| "grad_norm_var": 109.71666666666667, |
| "learning_rate": 0.0001, |
| "loss": 9.6562, |
| "loss/crossentropy": 2.443948082625866, |
| "loss/hidden": 4.025390625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.33005591817200186, |
| "step": 860 |
| }, |
| { |
| "epoch": 0.087, |
| "grad_norm": 99.0, |
| "grad_norm_var": 8.906043697083199e+17, |
| "learning_rate": 0.0001, |
| "loss": 9.6756, |
| "loss/crossentropy": 2.2569786101579665, |
| "loss/hidden": 4.169140625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.32912670746445655, |
| "step": 870 |
| }, |
| { |
| "epoch": 0.088, |
| "grad_norm": 87.5, |
| "grad_norm_var": 8.90604369488119e+17, |
| "learning_rate": 0.0001, |
| "loss": 9.6822, |
| "loss/crossentropy": 2.542811484634876, |
| "loss/hidden": 3.961328125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.3259673956781626, |
| "step": 880 |
| }, |
| { |
| "epoch": 0.089, |
| "grad_norm": 117.0, |
| "grad_norm_var": 227.42395833333333, |
| "learning_rate": 0.0001, |
| "loss": 9.44, |
| "loss/crossentropy": 2.3939336955547335, |
| "loss/hidden": 3.878515625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.29817260801792145, |
| "step": 890 |
| }, |
| { |
| "epoch": 0.09, |
| "grad_norm": 79.0, |
| "grad_norm_var": 200.97395833333334, |
| "learning_rate": 0.0001, |
| "loss": 9.3573, |
| "loss/crossentropy": 2.496935114264488, |
| "loss/hidden": 4.0015625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.3248747974634171, |
| "step": 900 |
| }, |
| { |
| "epoch": 0.091, |
| "grad_norm": 97.5, |
| "grad_norm_var": 517.7, |
| "learning_rate": 0.0001, |
| "loss": 9.4559, |
| "loss/crossentropy": 2.245865948498249, |
| "loss/hidden": 3.951953125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.30880712568759916, |
| "step": 910 |
| }, |
| { |
| "epoch": 0.092, |
| "grad_norm": 93.0, |
| "grad_norm_var": 475.07395833333334, |
| "learning_rate": 0.0001, |
| "loss": 9.3572, |
| "loss/crossentropy": 2.3004986569285393, |
| "loss/hidden": 3.912890625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2959143763408065, |
| "step": 920 |
| }, |
| { |
| "epoch": 0.093, |
| "grad_norm": 94.5, |
| "grad_norm_var": 139.9, |
| "learning_rate": 0.0001, |
| "loss": 9.461, |
| "loss/crossentropy": 2.360969065129757, |
| "loss/hidden": 3.9828125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.3106645856052637, |
| "step": 930 |
| }, |
| { |
| "epoch": 0.094, |
| "grad_norm": 102.5, |
| "grad_norm_var": 82.290625, |
| "learning_rate": 0.0001, |
| "loss": 9.3725, |
| "loss/crossentropy": 2.442077124118805, |
| "loss/hidden": 3.887109375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.30320504680275917, |
| "step": 940 |
| }, |
| { |
| "epoch": 0.095, |
| "grad_norm": 81.0, |
| "grad_norm_var": 283.8989583333333, |
| "learning_rate": 0.0001, |
| "loss": 9.215, |
| "loss/crossentropy": 2.2990706115961075, |
| "loss/hidden": 3.908203125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2945917289704084, |
| "step": 950 |
| }, |
| { |
| "epoch": 0.096, |
| "grad_norm": 85.5, |
| "grad_norm_var": 935.5291666666667, |
| "learning_rate": 0.0001, |
| "loss": 9.3148, |
| "loss/crossentropy": 2.405318558216095, |
| "loss/hidden": 3.8734375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.29377752766013143, |
| "step": 960 |
| }, |
| { |
| "epoch": 0.097, |
| "grad_norm": 90.5, |
| "grad_norm_var": 745.3291666666667, |
| "learning_rate": 0.0001, |
| "loss": 9.2675, |
| "loss/crossentropy": 2.313190388679504, |
| "loss/hidden": 3.908203125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.3074024930596352, |
| "step": 970 |
| }, |
| { |
| "epoch": 0.098, |
| "grad_norm": 91.5, |
| "grad_norm_var": 74.38333333333334, |
| "learning_rate": 0.0001, |
| "loss": 9.3473, |
| "loss/crossentropy": 2.4643412232398987, |
| "loss/hidden": 3.9109375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.31328765451908114, |
| "step": 980 |
| }, |
| { |
| "epoch": 0.099, |
| "grad_norm": 83.0, |
| "grad_norm_var": 77.24895833333333, |
| "learning_rate": 0.0001, |
| "loss": 9.1591, |
| "loss/crossentropy": 2.3321994699537756, |
| "loss/hidden": 3.794140625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2842238027602434, |
| "step": 990 |
| }, |
| { |
| "epoch": 0.1, |
| "grad_norm": 2919235584.0, |
| "grad_norm_var": 5.3262099304352365e+17, |
| "learning_rate": 0.0001, |
| "loss": 9.2499, |
| "loss/crossentropy": 2.24974425137043, |
| "loss/hidden": 3.69921875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2656703107059002, |
| "step": 1000 |
| }, |
| { |
| "epoch": 0.101, |
| "grad_norm": 83.0, |
| "grad_norm_var": 5.3262099137712704e+17, |
| "learning_rate": 0.0001, |
| "loss": 9.1036, |
| "loss/crossentropy": 2.248470115661621, |
| "loss/hidden": 3.834375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.28389163631945846, |
| "step": 1010 |
| }, |
| { |
| "epoch": 0.102, |
| "grad_norm": 99.5, |
| "grad_norm_var": 260.3958333333333, |
| "learning_rate": 0.0001, |
| "loss": 9.1529, |
| "loss/crossentropy": 2.177551028132439, |
| "loss/hidden": 3.85625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2901096811518073, |
| "step": 1020 |
| }, |
| { |
| "epoch": 0.103, |
| "grad_norm": 107.0, |
| "grad_norm_var": 126.18333333333334, |
| "learning_rate": 0.0001, |
| "loss": 9.2276, |
| "loss/crossentropy": 2.4588360369205473, |
| "loss/hidden": 3.81953125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.3054195210337639, |
| "step": 1030 |
| }, |
| { |
| "epoch": 0.104, |
| "grad_norm": 92.5, |
| "grad_norm_var": 773.6822916666666, |
| "learning_rate": 0.0001, |
| "loss": 9.2522, |
| "loss/crossentropy": 2.36704108864069, |
| "loss/hidden": 3.98984375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.32540309652686117, |
| "step": 1040 |
| }, |
| { |
| "epoch": 0.105, |
| "grad_norm": 94.0, |
| "grad_norm_var": 747.9958333333333, |
| "learning_rate": 0.0001, |
| "loss": 9.1546, |
| "loss/crossentropy": 2.2803470581769942, |
| "loss/hidden": 3.805078125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.3206649195402861, |
| "step": 1050 |
| }, |
| { |
| "epoch": 0.106, |
| "grad_norm": 74.0, |
| "grad_norm_var": 118.565625, |
| "learning_rate": 0.0001, |
| "loss": 9.1738, |
| "loss/crossentropy": 2.468463772535324, |
| "loss/hidden": 3.775390625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.3029760651290417, |
| "step": 1060 |
| }, |
| { |
| "epoch": 0.107, |
| "grad_norm": 76.0, |
| "grad_norm_var": 112.42395833333333, |
| "learning_rate": 0.0001, |
| "loss": 9.0442, |
| "loss/crossentropy": 2.3093275628983974, |
| "loss/hidden": 3.8203125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.30387087166309357, |
| "step": 1070 |
| }, |
| { |
| "epoch": 0.108, |
| "grad_norm": 92.5, |
| "grad_norm_var": 47.71666666666667, |
| "learning_rate": 0.0001, |
| "loss": 9.0691, |
| "loss/crossentropy": 2.3587117075920103, |
| "loss/hidden": 3.7921875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2959397092461586, |
| "step": 1080 |
| }, |
| { |
| "epoch": 0.109, |
| "grad_norm": 82.0, |
| "grad_norm_var": 82.57395833333334, |
| "learning_rate": 0.0001, |
| "loss": 9.0681, |
| "loss/crossentropy": 2.3668819189071657, |
| "loss/hidden": 3.873828125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.30734706819057467, |
| "step": 1090 |
| }, |
| { |
| "epoch": 0.11, |
| "grad_norm": 98.0, |
| "grad_norm_var": 130.02916666666667, |
| "learning_rate": 0.0001, |
| "loss": 9.1895, |
| "loss/crossentropy": 2.4498503282666206, |
| "loss/hidden": 3.838671875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.30784521605819465, |
| "step": 1100 |
| }, |
| { |
| "epoch": 0.111, |
| "grad_norm": 88.5, |
| "grad_norm_var": 92.05729166666667, |
| "learning_rate": 0.0001, |
| "loss": 9.2165, |
| "loss/crossentropy": 2.37082399725914, |
| "loss/hidden": 3.859375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.29330057725310327, |
| "step": 1110 |
| }, |
| { |
| "epoch": 0.112, |
| "grad_norm": 89.5, |
| "grad_norm_var": 160.39895833333333, |
| "learning_rate": 0.0001, |
| "loss": 9.0963, |
| "loss/crossentropy": 2.245619586110115, |
| "loss/hidden": 3.839453125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.3090781785547733, |
| "step": 1120 |
| }, |
| { |
| "epoch": 0.113, |
| "grad_norm": 89.0, |
| "grad_norm_var": 153.57395833333334, |
| "learning_rate": 0.0001, |
| "loss": 9.1747, |
| "loss/crossentropy": 2.254079730808735, |
| "loss/hidden": 3.863671875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.29664100557565687, |
| "step": 1130 |
| }, |
| { |
| "epoch": 0.114, |
| "grad_norm": 85.5, |
| "grad_norm_var": 177.3625, |
| "learning_rate": 0.0001, |
| "loss": 8.9365, |
| "loss/crossentropy": 2.3813750982284545, |
| "loss/hidden": 3.841796875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2955601759254932, |
| "step": 1140 |
| }, |
| { |
| "epoch": 0.115, |
| "grad_norm": 97.0, |
| "grad_norm_var": 177.75, |
| "learning_rate": 0.0001, |
| "loss": 9.0288, |
| "loss/crossentropy": 2.317107746005058, |
| "loss/hidden": 3.7171875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2740287099033594, |
| "step": 1150 |
| }, |
| { |
| "epoch": 0.116, |
| "grad_norm": 84.0, |
| "grad_norm_var": 192.15, |
| "learning_rate": 0.0001, |
| "loss": 8.9149, |
| "loss/crossentropy": 2.2348272860050202, |
| "loss/hidden": 3.748828125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.26385229676961897, |
| "step": 1160 |
| }, |
| { |
| "epoch": 0.117, |
| "grad_norm": 78.0, |
| "grad_norm_var": 139.8625, |
| "learning_rate": 0.0001, |
| "loss": 8.9416, |
| "loss/crossentropy": 2.186076807975769, |
| "loss/hidden": 3.68046875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2610600605607033, |
| "step": 1170 |
| }, |
| { |
| "epoch": 0.118, |
| "grad_norm": 80.5, |
| "grad_norm_var": 175.85, |
| "learning_rate": 0.0001, |
| "loss": 8.9542, |
| "loss/crossentropy": 2.258153685927391, |
| "loss/hidden": 3.740234375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.27120565343648195, |
| "step": 1180 |
| }, |
| { |
| "epoch": 0.119, |
| "grad_norm": 79.0, |
| "grad_norm_var": 164.89583333333334, |
| "learning_rate": 0.0001, |
| "loss": 8.8167, |
| "loss/crossentropy": 2.4536369144916534, |
| "loss/hidden": 3.7125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.28769057895988226, |
| "step": 1190 |
| }, |
| { |
| "epoch": 0.12, |
| "grad_norm": 63.0, |
| "grad_norm_var": 103.565625, |
| "learning_rate": 0.0001, |
| "loss": 8.7058, |
| "loss/crossentropy": 2.2031524434685705, |
| "loss/hidden": 3.709375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2841499318368733, |
| "step": 1200 |
| }, |
| { |
| "epoch": 0.121, |
| "grad_norm": 74.0, |
| "grad_norm_var": 117.23229166666667, |
| "learning_rate": 0.0001, |
| "loss": 8.8823, |
| "loss/crossentropy": 2.2541019685566424, |
| "loss/hidden": 3.725, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2822803447023034, |
| "step": 1210 |
| }, |
| { |
| "epoch": 0.122, |
| "grad_norm": 75.5, |
| "grad_norm_var": 163.3625, |
| "learning_rate": 0.0001, |
| "loss": 8.7654, |
| "loss/crossentropy": 2.4589641630649566, |
| "loss/hidden": 3.77265625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.28896796628832816, |
| "step": 1220 |
| }, |
| { |
| "epoch": 0.123, |
| "grad_norm": 83.0, |
| "grad_norm_var": 68.25, |
| "learning_rate": 0.0001, |
| "loss": 8.9438, |
| "loss/crossentropy": 2.2707848742604257, |
| "loss/hidden": 3.685546875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2688711144030094, |
| "step": 1230 |
| }, |
| { |
| "epoch": 0.124, |
| "grad_norm": 97.5, |
| "grad_norm_var": 75.89895833333334, |
| "learning_rate": 0.0001, |
| "loss": 8.8432, |
| "loss/crossentropy": 2.5097223311662673, |
| "loss/hidden": 3.656640625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.29047914147377013, |
| "step": 1240 |
| }, |
| { |
| "epoch": 0.125, |
| "grad_norm": 89.0, |
| "grad_norm_var": 1450.2989583333333, |
| "learning_rate": 0.0001, |
| "loss": 8.8377, |
| "loss/crossentropy": 2.3170286387205126, |
| "loss/hidden": 3.7, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2755675740540028, |
| "step": 1250 |
| }, |
| { |
| "epoch": 0.126, |
| "grad_norm": 65.0, |
| "grad_norm_var": 1693.0291666666667, |
| "learning_rate": 0.0001, |
| "loss": 8.6604, |
| "loss/crossentropy": 2.1438958957791328, |
| "loss/hidden": 3.639453125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2576067052781582, |
| "step": 1260 |
| }, |
| { |
| "epoch": 0.127, |
| "grad_norm": 74.0, |
| "grad_norm_var": 126.190625, |
| "learning_rate": 0.0001, |
| "loss": 8.8333, |
| "loss/crossentropy": 2.3025652706623077, |
| "loss/hidden": 3.691015625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2775576956570148, |
| "step": 1270 |
| }, |
| { |
| "epoch": 0.128, |
| "grad_norm": 71.5, |
| "grad_norm_var": 87.23229166666667, |
| "learning_rate": 0.0001, |
| "loss": 8.7094, |
| "loss/crossentropy": 2.13181097432971, |
| "loss/hidden": 3.702734375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.27296230792999265, |
| "step": 1280 |
| }, |
| { |
| "epoch": 0.129, |
| "grad_norm": 113.0, |
| "grad_norm_var": 149.48229166666667, |
| "learning_rate": 0.0001, |
| "loss": 8.6782, |
| "loss/crossentropy": 2.1315632432699205, |
| "loss/hidden": 3.625390625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2651492517441511, |
| "step": 1290 |
| }, |
| { |
| "epoch": 0.13, |
| "grad_norm": 85.0, |
| "grad_norm_var": 111.440625, |
| "learning_rate": 0.0001, |
| "loss": 8.742, |
| "loss/crossentropy": 2.339846658706665, |
| "loss/hidden": 3.623828125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2743611980229616, |
| "step": 1300 |
| }, |
| { |
| "epoch": 0.131, |
| "grad_norm": 86.0, |
| "grad_norm_var": 122.965625, |
| "learning_rate": 0.0001, |
| "loss": 8.6397, |
| "loss/crossentropy": 2.2031438082456587, |
| "loss/hidden": 3.5578125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2621523380279541, |
| "step": 1310 |
| }, |
| { |
| "epoch": 0.132, |
| "grad_norm": 71.5, |
| "grad_norm_var": 132.10729166666667, |
| "learning_rate": 0.0001, |
| "loss": 8.7931, |
| "loss/crossentropy": 2.465841978788376, |
| "loss/hidden": 3.657421875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.29582356065511706, |
| "step": 1320 |
| }, |
| { |
| "epoch": 0.133, |
| "grad_norm": 98.5, |
| "grad_norm_var": 136.34973958333333, |
| "learning_rate": 0.0001, |
| "loss": 8.7755, |
| "loss/crossentropy": 2.3093322798609734, |
| "loss/hidden": 3.675390625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.28201375566422937, |
| "step": 1330 |
| }, |
| { |
| "epoch": 0.134, |
| "grad_norm": 87.0, |
| "grad_norm_var": 45.1625, |
| "learning_rate": 0.0001, |
| "loss": 8.8767, |
| "loss/crossentropy": 2.3267322540283204, |
| "loss/hidden": 3.687109375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.27597835548222066, |
| "step": 1340 |
| }, |
| { |
| "epoch": 0.135, |
| "grad_norm": 78.5, |
| "grad_norm_var": 52.19583333333333, |
| "learning_rate": 0.0001, |
| "loss": 8.7636, |
| "loss/crossentropy": 2.250748935341835, |
| "loss/hidden": 3.722265625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.275000686571002, |
| "step": 1350 |
| }, |
| { |
| "epoch": 0.136, |
| "grad_norm": 74.0, |
| "grad_norm_var": 77.68229166666667, |
| "learning_rate": 0.0001, |
| "loss": 8.8309, |
| "loss/crossentropy": 2.294243222475052, |
| "loss/hidden": 3.781640625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.29517283104360104, |
| "step": 1360 |
| }, |
| { |
| "epoch": 0.137, |
| "grad_norm": 73.0, |
| "grad_norm_var": 70.565625, |
| "learning_rate": 0.0001, |
| "loss": 8.6486, |
| "loss/crossentropy": 2.4063815265893935, |
| "loss/hidden": 3.561328125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.27084620147943494, |
| "step": 1370 |
| }, |
| { |
| "epoch": 0.138, |
| "grad_norm": 72.0, |
| "grad_norm_var": 160.365625, |
| "learning_rate": 0.0001, |
| "loss": 8.6319, |
| "loss/crossentropy": 2.0357704624533652, |
| "loss/hidden": 3.55390625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.24529488924890758, |
| "step": 1380 |
| }, |
| { |
| "epoch": 0.139, |
| "grad_norm": 92.0, |
| "grad_norm_var": 159.2625, |
| "learning_rate": 0.0001, |
| "loss": 8.6773, |
| "loss/crossentropy": 2.207934172451496, |
| "loss/hidden": 3.626953125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.26206000819802283, |
| "step": 1390 |
| }, |
| { |
| "epoch": 0.14, |
| "grad_norm": 92.0, |
| "grad_norm_var": 75.85, |
| "learning_rate": 0.0001, |
| "loss": 8.6142, |
| "loss/crossentropy": 2.2258728444576263, |
| "loss/hidden": 3.694140625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2842423222959042, |
| "step": 1400 |
| }, |
| { |
| "epoch": 0.141, |
| "grad_norm": 75.5, |
| "grad_norm_var": 69.590625, |
| "learning_rate": 0.0001, |
| "loss": 8.7049, |
| "loss/crossentropy": 2.405027574300766, |
| "loss/hidden": 3.594140625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2595718756318092, |
| "step": 1410 |
| }, |
| { |
| "epoch": 0.142, |
| "grad_norm": 175.0, |
| "grad_norm_var": 622.85, |
| "learning_rate": 0.0001, |
| "loss": 8.5144, |
| "loss/crossentropy": 2.3508727669715883, |
| "loss/hidden": 3.6578125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2513396417722106, |
| "step": 1420 |
| }, |
| { |
| "epoch": 0.143, |
| "grad_norm": 144.0, |
| "grad_norm_var": 827.8739583333333, |
| "learning_rate": 0.0001, |
| "loss": 8.64, |
| "loss/crossentropy": 2.158524568378925, |
| "loss/hidden": 3.666015625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.25669998563826085, |
| "step": 1430 |
| }, |
| { |
| "epoch": 0.144, |
| "grad_norm": 90.5, |
| "grad_norm_var": 339.35729166666664, |
| "learning_rate": 0.0001, |
| "loss": 8.5076, |
| "loss/crossentropy": 2.1952589228749275, |
| "loss/hidden": 3.490625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.246895507350564, |
| "step": 1440 |
| }, |
| { |
| "epoch": 0.145, |
| "grad_norm": 65.5, |
| "grad_norm_var": 314.6333333333333, |
| "learning_rate": 0.0001, |
| "loss": 8.6159, |
| "loss/crossentropy": 2.3050056755542756, |
| "loss/hidden": 3.553515625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2641986530274153, |
| "step": 1450 |
| }, |
| { |
| "epoch": 0.146, |
| "grad_norm": 76.5, |
| "grad_norm_var": 426.1166666666667, |
| "learning_rate": 0.0001, |
| "loss": 8.527, |
| "loss/crossentropy": 2.281977441906929, |
| "loss/hidden": 3.49296875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2622336186468601, |
| "step": 1460 |
| }, |
| { |
| "epoch": 0.147, |
| "grad_norm": 74.0, |
| "grad_norm_var": 278.69348958333336, |
| "learning_rate": 0.0001, |
| "loss": 8.6149, |
| "loss/crossentropy": 2.303273032605648, |
| "loss/hidden": 3.5671875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2778003554791212, |
| "step": 1470 |
| }, |
| { |
| "epoch": 0.148, |
| "grad_norm": 102.0, |
| "grad_norm_var": 134.70729166666666, |
| "learning_rate": 0.0001, |
| "loss": 8.4927, |
| "loss/crossentropy": 2.40097414329648, |
| "loss/hidden": 3.536328125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.27044865442439914, |
| "step": 1480 |
| }, |
| { |
| "epoch": 0.149, |
| "grad_norm": 72.0, |
| "grad_norm_var": 87.8, |
| "learning_rate": 0.0001, |
| "loss": 8.4056, |
| "loss/crossentropy": 2.186897784471512, |
| "loss/hidden": 3.532421875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.24866797383874656, |
| "step": 1490 |
| }, |
| { |
| "epoch": 0.15, |
| "grad_norm": 75.5, |
| "grad_norm_var": 133.09583333333333, |
| "learning_rate": 0.0001, |
| "loss": 8.5426, |
| "loss/crossentropy": 2.311472164094448, |
| "loss/hidden": 3.53359375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.25585599690675737, |
| "step": 1500 |
| }, |
| { |
| "epoch": 0.151, |
| "grad_norm": 136.0, |
| "grad_norm_var": 258.7625, |
| "learning_rate": 0.0001, |
| "loss": 8.3875, |
| "loss/crossentropy": 2.2983651250600814, |
| "loss/hidden": 3.562890625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2763795707374811, |
| "step": 1510 |
| }, |
| { |
| "epoch": 0.152, |
| "grad_norm": 94.5, |
| "grad_norm_var": 292.75598958333336, |
| "learning_rate": 0.0001, |
| "loss": 8.5971, |
| "loss/crossentropy": 2.3549255669116973, |
| "loss/hidden": 3.55703125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.268990096822381, |
| "step": 1520 |
| }, |
| { |
| "epoch": 0.153, |
| "grad_norm": 83.0, |
| "grad_norm_var": 1.4189153071319926e+18, |
| "learning_rate": 0.0001, |
| "loss": 8.7383, |
| "loss/crossentropy": 2.267159214615822, |
| "loss/hidden": 3.5671875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.27373309470713136, |
| "step": 1530 |
| }, |
| { |
| "epoch": 0.154, |
| "grad_norm": 77.5, |
| "grad_norm_var": 63.916666666666664, |
| "learning_rate": 0.0001, |
| "loss": 8.5718, |
| "loss/crossentropy": 2.259125065803528, |
| "loss/hidden": 3.67265625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.28385352455079554, |
| "step": 1540 |
| }, |
| { |
| "epoch": 0.155, |
| "grad_norm": 73.5, |
| "grad_norm_var": 39.78723958333333, |
| "learning_rate": 0.0001, |
| "loss": 8.4993, |
| "loss/crossentropy": 2.3606351226568223, |
| "loss/hidden": 3.558984375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.272869897633791, |
| "step": 1550 |
| }, |
| { |
| "epoch": 0.156, |
| "grad_norm": 71.5, |
| "grad_norm_var": 247.2625, |
| "learning_rate": 0.0001, |
| "loss": 8.6188, |
| "loss/crossentropy": 2.394289918243885, |
| "loss/hidden": 3.519921875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.269069866463542, |
| "step": 1560 |
| }, |
| { |
| "epoch": 0.157, |
| "grad_norm": 71.0, |
| "grad_norm_var": 265.7, |
| "learning_rate": 0.0001, |
| "loss": 8.4936, |
| "loss/crossentropy": 2.2599784307181836, |
| "loss/hidden": 3.533203125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.26600994151085616, |
| "step": 1570 |
| }, |
| { |
| "epoch": 0.158, |
| "grad_norm": 83.0, |
| "grad_norm_var": 42.88333333333333, |
| "learning_rate": 0.0001, |
| "loss": 8.5015, |
| "loss/crossentropy": 2.3098704159259795, |
| "loss/hidden": 3.628515625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.285567194968462, |
| "step": 1580 |
| }, |
| { |
| "epoch": 0.159, |
| "grad_norm": 67.0, |
| "grad_norm_var": 111.665625, |
| "learning_rate": 0.0001, |
| "loss": 8.4128, |
| "loss/crossentropy": 2.1794722147285936, |
| "loss/hidden": 3.526953125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2647275095805526, |
| "step": 1590 |
| }, |
| { |
| "epoch": 0.16, |
| "grad_norm": 91.0, |
| "grad_norm_var": 149.97890625, |
| "learning_rate": 0.0001, |
| "loss": 8.4745, |
| "loss/crossentropy": 2.2243838563561438, |
| "loss/hidden": 3.550390625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2563688028603792, |
| "step": 1600 |
| }, |
| { |
| "epoch": 0.161, |
| "grad_norm": 90.0, |
| "grad_norm_var": 157.87395833333332, |
| "learning_rate": 0.0001, |
| "loss": 8.4168, |
| "loss/crossentropy": 2.3965038657188416, |
| "loss/hidden": 3.52265625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.27364722844213246, |
| "step": 1610 |
| }, |
| { |
| "epoch": 0.162, |
| "grad_norm": 96.0, |
| "grad_norm_var": 380.89348958333335, |
| "learning_rate": 0.0001, |
| "loss": 8.6256, |
| "loss/crossentropy": 2.519009140133858, |
| "loss/hidden": 3.536328125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.29145103991031646, |
| "step": 1620 |
| }, |
| { |
| "epoch": 0.163, |
| "grad_norm": 80.0, |
| "grad_norm_var": 331.05, |
| "learning_rate": 0.0001, |
| "loss": 8.2011, |
| "loss/crossentropy": 2.1994084089994432, |
| "loss/hidden": 3.530078125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2542119387537241, |
| "step": 1630 |
| }, |
| { |
| "epoch": 0.164, |
| "grad_norm": 72.0, |
| "grad_norm_var": 41.19583333333333, |
| "learning_rate": 0.0001, |
| "loss": 8.3636, |
| "loss/crossentropy": 2.4333469703793527, |
| "loss/hidden": 3.4828125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.25861090533435344, |
| "step": 1640 |
| }, |
| { |
| "epoch": 0.165, |
| "grad_norm": 79.0, |
| "grad_norm_var": 226.29583333333332, |
| "learning_rate": 0.0001, |
| "loss": 8.5285, |
| "loss/crossentropy": 2.468096488714218, |
| "loss/hidden": 3.478515625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.26285996809601786, |
| "step": 1650 |
| }, |
| { |
| "epoch": 0.166, |
| "grad_norm": 84.5, |
| "grad_norm_var": 218.12916666666666, |
| "learning_rate": 0.0001, |
| "loss": 8.4346, |
| "loss/crossentropy": 2.2107077345252035, |
| "loss/hidden": 3.591015625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2654247496277094, |
| "step": 1660 |
| }, |
| { |
| "epoch": 0.167, |
| "grad_norm": 68.0, |
| "grad_norm_var": 47.329166666666666, |
| "learning_rate": 0.0001, |
| "loss": 8.4021, |
| "loss/crossentropy": 2.188153588026762, |
| "loss/hidden": 3.481640625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.24756914153695106, |
| "step": 1670 |
| }, |
| { |
| "epoch": 0.168, |
| "grad_norm": 68.0, |
| "grad_norm_var": 232.240625, |
| "learning_rate": 0.0001, |
| "loss": 8.4491, |
| "loss/crossentropy": 2.3357387453317644, |
| "loss/hidden": 3.552734375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.28453084602952006, |
| "step": 1680 |
| }, |
| { |
| "epoch": 0.169, |
| "grad_norm": 63.5, |
| "grad_norm_var": 179.80729166666666, |
| "learning_rate": 0.0001, |
| "loss": 8.4439, |
| "loss/crossentropy": 2.3677712947130205, |
| "loss/hidden": 3.6484375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2788976304233074, |
| "step": 1690 |
| }, |
| { |
| "epoch": 0.17, |
| "grad_norm": 119.5, |
| "grad_norm_var": 398.665625, |
| "learning_rate": 0.0001, |
| "loss": 8.3827, |
| "loss/crossentropy": 2.4275426417589188, |
| "loss/hidden": 3.425390625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.26100732628256085, |
| "step": 1700 |
| }, |
| { |
| "epoch": 0.171, |
| "grad_norm": 66.5, |
| "grad_norm_var": 209.37395833333332, |
| "learning_rate": 0.0001, |
| "loss": 8.3197, |
| "loss/crossentropy": 2.237619758397341, |
| "loss/hidden": 3.508203125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2523366323672235, |
| "step": 1710 |
| }, |
| { |
| "epoch": 0.172, |
| "grad_norm": 171.0, |
| "grad_norm_var": 636.4833333333333, |
| "learning_rate": 0.0001, |
| "loss": 8.2648, |
| "loss/crossentropy": 2.169030448794365, |
| "loss/hidden": 3.505078125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.24771953662857413, |
| "step": 1720 |
| }, |
| { |
| "epoch": 0.173, |
| "grad_norm": 68.0, |
| "grad_norm_var": 861.1247395833333, |
| "learning_rate": 0.0001, |
| "loss": 8.2948, |
| "loss/crossentropy": 2.197067990899086, |
| "loss/hidden": 3.424609375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2451560577377677, |
| "step": 1730 |
| }, |
| { |
| "epoch": 0.174, |
| "grad_norm": 63.5, |
| "grad_norm_var": 524.7833333333333, |
| "learning_rate": 0.0001, |
| "loss": 8.2316, |
| "loss/crossentropy": 2.2412655726075172, |
| "loss/hidden": 3.498046875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.26945888753980396, |
| "step": 1740 |
| }, |
| { |
| "epoch": 0.175, |
| "grad_norm": 90.5, |
| "grad_norm_var": 496.12395833333335, |
| "learning_rate": 0.0001, |
| "loss": 8.3094, |
| "loss/crossentropy": 2.314925655722618, |
| "loss/hidden": 3.59375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.27251414209604263, |
| "step": 1750 |
| }, |
| { |
| "epoch": 0.176, |
| "grad_norm": 70.0, |
| "grad_norm_var": 484.890625, |
| "learning_rate": 0.0001, |
| "loss": 8.3807, |
| "loss/crossentropy": 2.3074424833059313, |
| "loss/hidden": 3.4640625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2574224047362804, |
| "step": 1760 |
| }, |
| { |
| "epoch": 0.177, |
| "grad_norm": 69.0, |
| "grad_norm_var": 88.83932291666666, |
| "learning_rate": 0.0001, |
| "loss": 8.3403, |
| "loss/crossentropy": 2.2954701989889146, |
| "loss/hidden": 3.46484375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.25794004313647745, |
| "step": 1770 |
| }, |
| { |
| "epoch": 0.178, |
| "grad_norm": 71.5, |
| "grad_norm_var": 86.65598958333334, |
| "learning_rate": 0.0001, |
| "loss": 8.1745, |
| "loss/crossentropy": 2.2755073979496956, |
| "loss/hidden": 3.521875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.26081139910966156, |
| "step": 1780 |
| }, |
| { |
| "epoch": 0.179, |
| "grad_norm": 73.5, |
| "grad_norm_var": 46.40390625, |
| "learning_rate": 0.0001, |
| "loss": 8.2619, |
| "loss/crossentropy": 2.2126931130886076, |
| "loss/hidden": 3.4921875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.27995246797800066, |
| "step": 1790 |
| }, |
| { |
| "epoch": 0.18, |
| "grad_norm": 70.0, |
| "grad_norm_var": 44.215625, |
| "learning_rate": 0.0001, |
| "loss": 8.3462, |
| "loss/crossentropy": 2.3120188415050507, |
| "loss/hidden": 3.482421875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.25568581037223337, |
| "step": 1800 |
| }, |
| { |
| "epoch": 0.181, |
| "grad_norm": 79.0, |
| "grad_norm_var": 250.965625, |
| "learning_rate": 0.0001, |
| "loss": 8.3991, |
| "loss/crossentropy": 2.2807445406913756, |
| "loss/hidden": 3.4328125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2566069485619664, |
| "step": 1810 |
| }, |
| { |
| "epoch": 0.182, |
| "grad_norm": 72.5, |
| "grad_norm_var": 287.98333333333335, |
| "learning_rate": 0.0001, |
| "loss": 8.2019, |
| "loss/crossentropy": 2.3523808985948564, |
| "loss/hidden": 3.369921875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2528150577098131, |
| "step": 1820 |
| }, |
| { |
| "epoch": 0.183, |
| "grad_norm": 85.0, |
| "grad_norm_var": 37.733072916666664, |
| "learning_rate": 0.0001, |
| "loss": 8.1958, |
| "loss/crossentropy": 2.0805646784603598, |
| "loss/hidden": 3.38359375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.22354185171425342, |
| "step": 1830 |
| }, |
| { |
| "epoch": 0.184, |
| "grad_norm": 67.5, |
| "grad_norm_var": 72.85729166666667, |
| "learning_rate": 0.0001, |
| "loss": 8.0768, |
| "loss/crossentropy": 2.3133904695510865, |
| "loss/hidden": 3.401171875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.24347416013479234, |
| "step": 1840 |
| }, |
| { |
| "epoch": 0.185, |
| "grad_norm": 79.0, |
| "grad_norm_var": 169.65833333333333, |
| "learning_rate": 0.0001, |
| "loss": 8.2994, |
| "loss/crossentropy": 2.3512276649475097, |
| "loss/hidden": 3.444921875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.26196608748286965, |
| "step": 1850 |
| }, |
| { |
| "epoch": 0.186, |
| "grad_norm": 69.5, |
| "grad_norm_var": 2388.08515625, |
| "learning_rate": 0.0001, |
| "loss": 8.3424, |
| "loss/crossentropy": 2.3174356922507284, |
| "loss/hidden": 3.454296875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2508500372990966, |
| "step": 1860 |
| }, |
| { |
| "epoch": 0.187, |
| "grad_norm": 60.0, |
| "grad_norm_var": 196.15833333333333, |
| "learning_rate": 0.0001, |
| "loss": 8.256, |
| "loss/crossentropy": 2.280574831366539, |
| "loss/hidden": 3.421875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2615037776529789, |
| "step": 1870 |
| }, |
| { |
| "epoch": 0.188, |
| "grad_norm": 73.0, |
| "grad_norm_var": 116.42890625, |
| "learning_rate": 0.0001, |
| "loss": 8.2108, |
| "loss/crossentropy": 2.275608576834202, |
| "loss/hidden": 3.449609375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2556317184120417, |
| "step": 1880 |
| }, |
| { |
| "epoch": 0.189, |
| "grad_norm": 66.5, |
| "grad_norm_var": 38.723958333333336, |
| "learning_rate": 0.0001, |
| "loss": 8.3584, |
| "loss/crossentropy": 2.356363560259342, |
| "loss/hidden": 3.490625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.26850553378462794, |
| "step": 1890 |
| }, |
| { |
| "epoch": 0.19, |
| "grad_norm": 70.0, |
| "grad_norm_var": 90.62916666666666, |
| "learning_rate": 0.0001, |
| "loss": 8.1875, |
| "loss/crossentropy": 2.282008448243141, |
| "loss/hidden": 3.4234375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.25158569142222403, |
| "step": 1900 |
| }, |
| { |
| "epoch": 0.191, |
| "grad_norm": 69.0, |
| "grad_norm_var": 26.895833333333332, |
| "learning_rate": 0.0001, |
| "loss": 8.1676, |
| "loss/crossentropy": 2.3583726406097414, |
| "loss/hidden": 3.475, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2574294516816735, |
| "step": 1910 |
| }, |
| { |
| "epoch": 0.192, |
| "grad_norm": 62.25, |
| "grad_norm_var": 34.430989583333336, |
| "learning_rate": 0.0001, |
| "loss": 8.2457, |
| "loss/crossentropy": 2.310526317358017, |
| "loss/hidden": 3.407421875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.25894895792007444, |
| "step": 1920 |
| }, |
| { |
| "epoch": 0.193, |
| "grad_norm": 84.0, |
| "grad_norm_var": 65.58307291666667, |
| "learning_rate": 0.0001, |
| "loss": 8.2176, |
| "loss/crossentropy": 2.0871855318546295, |
| "loss/hidden": 3.3796875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.24589193761348724, |
| "step": 1930 |
| }, |
| { |
| "epoch": 0.194, |
| "grad_norm": 65.5, |
| "grad_norm_var": 39.07473958333333, |
| "learning_rate": 0.0001, |
| "loss": 8.1842, |
| "loss/crossentropy": 2.261622406542301, |
| "loss/hidden": 3.3609375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.24441927969455718, |
| "step": 1940 |
| }, |
| { |
| "epoch": 0.195, |
| "grad_norm": 64.0, |
| "grad_norm_var": 57.848958333333336, |
| "learning_rate": 0.0001, |
| "loss": 8.1485, |
| "loss/crossentropy": 2.386093820631504, |
| "loss/hidden": 3.3359375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2539959207177162, |
| "step": 1950 |
| }, |
| { |
| "epoch": 0.196, |
| "grad_norm": 72.5, |
| "grad_norm_var": 37.01223958333333, |
| "learning_rate": 0.0001, |
| "loss": 8.3277, |
| "loss/crossentropy": 2.2825982570648193, |
| "loss/hidden": 3.433203125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2809562737122178, |
| "step": 1960 |
| }, |
| { |
| "epoch": 0.197, |
| "grad_norm": 69.0, |
| "grad_norm_var": 15.633333333333333, |
| "learning_rate": 0.0001, |
| "loss": 8.1367, |
| "loss/crossentropy": 2.181477516889572, |
| "loss/hidden": 3.494140625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.26897694952785967, |
| "step": 1970 |
| }, |
| { |
| "epoch": 0.198, |
| "grad_norm": 68.5, |
| "grad_norm_var": 13.966666666666667, |
| "learning_rate": 0.0001, |
| "loss": 8.1232, |
| "loss/crossentropy": 2.292652648687363, |
| "loss/hidden": 3.409765625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.24958589412271975, |
| "step": 1980 |
| }, |
| { |
| "epoch": 0.199, |
| "grad_norm": 68.0, |
| "grad_norm_var": 88.34765625, |
| "learning_rate": 0.0001, |
| "loss": 8.09, |
| "loss/crossentropy": 2.367698776721954, |
| "loss/hidden": 3.387109375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2641737159341574, |
| "step": 1990 |
| }, |
| { |
| "epoch": 0.2, |
| "grad_norm": 91.5, |
| "grad_norm_var": 120.825, |
| "learning_rate": 0.0001, |
| "loss": 8.1587, |
| "loss/crossentropy": 2.3354921892285345, |
| "loss/hidden": 3.42109375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.24634175039827824, |
| "step": 2000 |
| } |
| ], |
| "logging_steps": 10, |
| "max_steps": 10000, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 9223372036854775807, |
| "save_steps": 2000, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": false |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 5.715020064017613e+18, |
| "train_batch_size": 2, |
| "trial_name": null, |
| "trial_params": null |
| } |