{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.25, "eval_steps": 2000, "global_step": 4000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.000125, "grad_norm": 452.0, "learning_rate": 1.18e-05, "loss": 102.5405, "loss/crossentropy": 9.402848720550537, "loss/hidden": 16.5, "loss/jsd": 0.0, "loss/logits": 7.375310897827148, "step": 2 }, { "epoch": 0.00025, "grad_norm": 414.0, "learning_rate": 1.3600000000000002e-05, "loss": 101.664, "loss/crossentropy": 9.609405994415283, "loss/hidden": 16.5, "loss/jsd": 0.0, "loss/logits": 7.738794326782227, "step": 4 }, { "epoch": 0.000375, "grad_norm": 378.0, "learning_rate": 1.54e-05, "loss": 101.1724, "loss/crossentropy": 9.574851512908936, "loss/hidden": 16.5625, "loss/jsd": 0.0, "loss/logits": 7.784070253372192, "step": 6 }, { "epoch": 0.0005, "grad_norm": 219.0, "learning_rate": 1.72e-05, "loss": 95.902, "loss/crossentropy": 9.218470573425293, "loss/hidden": 16.5, "loss/jsd": 0.0, "loss/logits": 7.2567784786224365, "step": 8 }, { "epoch": 0.000625, "grad_norm": 176.0, "learning_rate": 1.9e-05, "loss": 92.9676, "loss/crossentropy": 8.523199081420898, "loss/hidden": 16.5, "loss/jsd": 0.0, "loss/logits": 6.777792453765869, "step": 10 }, { "epoch": 0.00075, "grad_norm": 161.0, "learning_rate": 2.0800000000000004e-05, "loss": 92.2125, "loss/crossentropy": 8.476105213165283, "loss/hidden": 16.25, "loss/jsd": 0.0, "loss/logits": 6.428695201873779, "step": 12 }, { "epoch": 0.000875, "grad_norm": 132.0, "learning_rate": 2.2600000000000004e-05, "loss": 88.1193, "loss/crossentropy": 8.148025274276733, "loss/hidden": 16.25, "loss/jsd": 0.0, "loss/logits": 6.29009485244751, "step": 14 }, { "epoch": 0.001, "grad_norm": 111.5, "grad_norm_var": 16966.165625, "learning_rate": 2.4400000000000004e-05, "loss": 82.802, "loss/crossentropy": 7.800824403762817, "loss/hidden": 16.0625, "loss/jsd": 0.0, "loss/logits": 6.119067668914795, "step": 16 }, { "epoch": 0.001125, "grad_norm": 202.0, "grad_norm_var": 12627.807291666666, "learning_rate": 2.6200000000000003e-05, "loss": 83.659, "loss/crossentropy": 7.971633672714233, "loss/hidden": 15.6875, "loss/jsd": 0.0, "loss/logits": 5.801014423370361, "step": 18 }, { "epoch": 0.00125, "grad_norm": 98.5, "grad_norm_var": 9502.990625, "learning_rate": 2.8000000000000003e-05, "loss": 79.2692, "loss/crossentropy": 7.694047451019287, "loss/hidden": 15.3125, "loss/jsd": 0.0, "loss/logits": 5.926986217498779, "step": 20 }, { "epoch": 0.001375, "grad_norm": 44.75, "grad_norm_var": 4617.248958333334, "learning_rate": 2.9800000000000006e-05, "loss": 75.9179, "loss/crossentropy": 7.279789924621582, "loss/hidden": 15.21875, "loss/jsd": 0.0, "loss/logits": 5.121587514877319, "step": 22 }, { "epoch": 0.0015, "grad_norm": 51.75, "grad_norm_var": 4630.629166666667, "learning_rate": 3.16e-05, "loss": 73.0607, "loss/crossentropy": 7.157355308532715, "loss/hidden": 14.96875, "loss/jsd": 0.0, "loss/logits": 5.0728209018707275, "step": 24 }, { "epoch": 0.001625, "grad_norm": 119.5, "grad_norm_var": 4558.848958333333, "learning_rate": 3.3400000000000005e-05, "loss": 70.6761, "loss/crossentropy": 6.860254287719727, "loss/hidden": 14.90625, "loss/jsd": 0.0, "loss/logits": 4.800362586975098, "step": 26 }, { "epoch": 0.00175, "grad_norm": 70.5, "grad_norm_var": 4260.416666666667, "learning_rate": 3.520000000000001e-05, "loss": 66.2243, "loss/crossentropy": 6.543152332305908, "loss/hidden": 14.625, "loss/jsd": 0.0, "loss/logits": 4.315927267074585, "step": 28 }, { "epoch": 0.001875, "grad_norm": 64.5, "grad_norm_var": 4383.795572916667, "learning_rate": 3.7e-05, "loss": 61.0467, "loss/crossentropy": 6.067888975143433, "loss/hidden": 14.0, "loss/jsd": 0.0, "loss/logits": 4.298548221588135, "step": 30 }, { "epoch": 0.002, "grad_norm": 101.5, "grad_norm_var": 4350.370572916667, "learning_rate": 3.88e-05, "loss": 59.3226, "loss/crossentropy": 5.8317787647247314, "loss/hidden": 13.5625, "loss/jsd": 0.0, "loss/logits": 4.109991788864136, "step": 32 }, { "epoch": 0.002125, "grad_norm": 73.0, "grad_norm_var": 802.3372395833334, "learning_rate": 4.0600000000000004e-05, "loss": 54.6985, "loss/crossentropy": 5.255858659744263, "loss/hidden": 13.40625, "loss/jsd": 0.0, "loss/logits": 3.483602285385132, "step": 34 }, { "epoch": 0.00225, "grad_norm": 60.0, "grad_norm_var": 778.9997395833333, "learning_rate": 4.240000000000001e-05, "loss": 50.0453, "loss/crossentropy": 5.106731653213501, "loss/hidden": 12.9375, "loss/jsd": 0.0, "loss/logits": 3.3304505348205566, "step": 36 }, { "epoch": 0.002375, "grad_norm": 77.5, "grad_norm_var": 710.56015625, "learning_rate": 4.420000000000001e-05, "loss": 46.1392, "loss/crossentropy": 4.786301136016846, "loss/hidden": 12.375, "loss/jsd": 0.0, "loss/logits": 2.817455768585205, "step": 38 }, { "epoch": 0.0025, "grad_norm": 53.5, "grad_norm_var": 682.28515625, "learning_rate": 4.600000000000001e-05, "loss": 41.2416, "loss/crossentropy": 4.526076555252075, "loss/hidden": 11.8125, "loss/jsd": 0.0, "loss/logits": 2.548874258995056, "step": 40 }, { "epoch": 0.002625, "grad_norm": 50.5, "grad_norm_var": 603.9322916666666, "learning_rate": 4.78e-05, "loss": 37.7184, "loss/crossentropy": 4.177120208740234, "loss/hidden": 11.5625, "loss/jsd": 0.0, "loss/logits": 2.2114726305007935, "step": 42 }, { "epoch": 0.00275, "grad_norm": 43.0, "grad_norm_var": 271.6497395833333, "learning_rate": 4.96e-05, "loss": 36.3335, "loss/crossentropy": 3.943485736846924, "loss/hidden": 11.125, "loss/jsd": 0.0, "loss/logits": 2.138027310371399, "step": 44 }, { "epoch": 0.002875, "grad_norm": 39.75, "grad_norm_var": 335.72473958333336, "learning_rate": 5.14e-05, "loss": 33.4218, "loss/crossentropy": 3.7944674491882324, "loss/hidden": 10.6875, "loss/jsd": 0.0, "loss/logits": 1.9265018701553345, "step": 46 }, { "epoch": 0.003, "grad_norm": 31.375, "grad_norm_var": 227.42962239583332, "learning_rate": 5.3200000000000006e-05, "loss": 31.3299, "loss/crossentropy": 3.8822021484375, "loss/hidden": 10.21875, "loss/jsd": 0.0, "loss/logits": 1.8083009719848633, "step": 48 }, { "epoch": 0.003125, "grad_norm": 36.0, "grad_norm_var": 201.03515625, "learning_rate": 5.500000000000001e-05, "loss": 30.3259, "loss/crossentropy": 3.730429768562317, "loss/hidden": 10.0, "loss/jsd": 0.0, "loss/logits": 1.6891510486602783, "step": 50 }, { "epoch": 0.00325, "grad_norm": 27.5, "grad_norm_var": 188.65104166666666, "learning_rate": 5.680000000000001e-05, "loss": 28.9668, "loss/crossentropy": 3.922508955001831, "loss/hidden": 9.78125, "loss/jsd": 0.0, "loss/logits": 1.5780616998672485, "step": 52 }, { "epoch": 0.003375, "grad_norm": 22.75, "grad_norm_var": 134.12682291666667, "learning_rate": 5.860000000000001e-05, "loss": 28.0035, "loss/crossentropy": 3.7553197145462036, "loss/hidden": 9.375, "loss/jsd": 0.0, "loss/logits": 1.5748284459114075, "step": 54 }, { "epoch": 0.0035, "grad_norm": 17.25, "grad_norm_var": 137.21458333333334, "learning_rate": 6.040000000000001e-05, "loss": 26.6777, "loss/crossentropy": 3.3777416944503784, "loss/hidden": 9.25, "loss/jsd": 0.0, "loss/logits": 1.3980408906936646, "step": 56 }, { "epoch": 0.003625, "grad_norm": 25.0, "grad_norm_var": 119.43170572916667, "learning_rate": 6.220000000000001e-05, "loss": 25.1038, "loss/crossentropy": 3.0976059436798096, "loss/hidden": 9.0625, "loss/jsd": 0.0, "loss/logits": 1.2831275463104248, "step": 58 }, { "epoch": 0.00375, "grad_norm": 25.0, "grad_norm_var": 114.703125, "learning_rate": 6.400000000000001e-05, "loss": 25.2283, "loss/crossentropy": 3.511284828186035, "loss/hidden": 9.15625, "loss/jsd": 0.0, "loss/logits": 1.472885012626648, "step": 60 }, { "epoch": 0.003875, "grad_norm": 33.0, "grad_norm_var": 112.51223958333334, "learning_rate": 6.58e-05, "loss": 24.4774, "loss/crossentropy": 3.6399108171463013, "loss/hidden": 8.6875, "loss/jsd": 0.0, "loss/logits": 1.3320220708847046, "step": 62 }, { "epoch": 0.004, "grad_norm": 16.875, "grad_norm_var": 129.7978515625, "learning_rate": 6.76e-05, "loss": 23.5559, "loss/crossentropy": 3.2922180891036987, "loss/hidden": 8.5625, "loss/jsd": 0.0, "loss/logits": 1.225121259689331, "step": 64 }, { "epoch": 0.004125, "grad_norm": 18.625, "grad_norm_var": 130.56875, "learning_rate": 6.94e-05, "loss": 22.8844, "loss/crossentropy": 3.145629405975342, "loss/hidden": 8.3125, "loss/jsd": 0.0, "loss/logits": 1.0847916007041931, "step": 66 }, { "epoch": 0.00425, "grad_norm": 22.25, "grad_norm_var": 284.284375, "learning_rate": 7.120000000000001e-05, "loss": 22.1537, "loss/crossentropy": 3.037510633468628, "loss/hidden": 8.171875, "loss/jsd": 0.0, "loss/logits": 1.051486313343048, "step": 68 }, { "epoch": 0.004375, "grad_norm": 15.9375, "grad_norm_var": 204.964697265625, "learning_rate": 7.3e-05, "loss": 21.0354, "loss/crossentropy": 2.841916799545288, "loss/hidden": 7.921875, "loss/jsd": 0.0, "loss/logits": 0.9865279197692871, "step": 70 }, { "epoch": 0.0045, "grad_norm": 13.125, "grad_norm_var": 210.74607747395834, "learning_rate": 7.48e-05, "loss": 20.6134, "loss/crossentropy": 2.9782702922821045, "loss/hidden": 7.84375, "loss/jsd": 0.0, "loss/logits": 1.009881168603897, "step": 72 }, { "epoch": 0.004625, "grad_norm": 19.75, "grad_norm_var": 218.80983072916666, "learning_rate": 7.66e-05, "loss": 20.7128, "loss/crossentropy": 2.9653278589248657, "loss/hidden": 7.5625, "loss/jsd": 0.0, "loss/logits": 1.012134611606598, "step": 74 }, { "epoch": 0.00475, "grad_norm": 16.25, "grad_norm_var": 220.57526041666668, "learning_rate": 7.840000000000001e-05, "loss": 19.461, "loss/crossentropy": 2.935641646385193, "loss/hidden": 7.5625, "loss/jsd": 0.0, "loss/logits": 0.9427644312381744, "step": 76 }, { "epoch": 0.004875, "grad_norm": 17.375, "grad_norm_var": 219.2650390625, "learning_rate": 8.020000000000001e-05, "loss": 19.1704, "loss/crossentropy": 2.723261594772339, "loss/hidden": 7.3125, "loss/jsd": 0.0, "loss/logits": 0.8925547003746033, "step": 78 }, { "epoch": 0.005, "grad_norm": 44.25, "grad_norm_var": 250.6109375, "learning_rate": 8.200000000000001e-05, "loss": 19.3476, "loss/crossentropy": 3.0457217693328857, "loss/hidden": 7.265625, "loss/jsd": 0.0, "loss/logits": 0.9127894341945648, "step": 80 }, { "epoch": 0.005125, "grad_norm": 18.25, "grad_norm_var": 251.53697916666667, "learning_rate": 8.38e-05, "loss": 19.5412, "loss/crossentropy": 2.981551766395569, "loss/hidden": 7.3125, "loss/jsd": 0.0, "loss/logits": 0.9058282673358917, "step": 82 }, { "epoch": 0.00525, "grad_norm": 13.6875, "grad_norm_var": 53.40358072916667, "learning_rate": 8.560000000000001e-05, "loss": 18.4877, "loss/crossentropy": 2.923082709312439, "loss/hidden": 7.234375, "loss/jsd": 0.0, "loss/logits": 0.9712814092636108, "step": 84 }, { "epoch": 0.005375, "grad_norm": 16.625, "grad_norm_var": 55.94166666666667, "learning_rate": 8.740000000000001e-05, "loss": 18.5095, "loss/crossentropy": 2.6508177518844604, "loss/hidden": 7.140625, "loss/jsd": 0.0, "loss/logits": 0.7856882810592651, "step": 86 }, { "epoch": 0.0055, "grad_norm": 18.25, "grad_norm_var": 55.864697265625, "learning_rate": 8.92e-05, "loss": 18.6377, "loss/crossentropy": 3.1421643495559692, "loss/hidden": 7.015625, "loss/jsd": 0.0, "loss/logits": 0.8637695610523224, "step": 88 }, { "epoch": 0.005625, "grad_norm": 15.125, "grad_norm_var": 55.58743489583333, "learning_rate": 9.1e-05, "loss": 18.4141, "loss/crossentropy": 2.9101529121398926, "loss/hidden": 6.828125, "loss/jsd": 0.0, "loss/logits": 0.871369868516922, "step": 90 }, { "epoch": 0.00575, "grad_norm": 14.5625, "grad_norm_var": 55.61756184895833, "learning_rate": 9.28e-05, "loss": 17.595, "loss/crossentropy": 2.702631711959839, "loss/hidden": 6.640625, "loss/jsd": 0.0, "loss/logits": 0.794559121131897, "step": 92 }, { "epoch": 0.005875, "grad_norm": 18.875, "grad_norm_var": 56.09021809895833, "learning_rate": 9.46e-05, "loss": 18.1246, "loss/crossentropy": 2.627371072769165, "loss/hidden": 6.8125, "loss/jsd": 0.0, "loss/logits": 0.8269491195678711, "step": 94 }, { "epoch": 0.006, "grad_norm": 16.625, "grad_norm_var": 5.282405598958333, "learning_rate": 9.64e-05, "loss": 18.1242, "loss/crossentropy": 2.9903202056884766, "loss/hidden": 6.6875, "loss/jsd": 0.0, "loss/logits": 0.9215152859687805, "step": 96 }, { "epoch": 0.006125, "grad_norm": 12.375, "grad_norm_var": 4.96171875, "learning_rate": 9.82e-05, "loss": 17.2435, "loss/crossentropy": 2.599429130554199, "loss/hidden": 6.734375, "loss/jsd": 0.0, "loss/logits": 0.7650539577007294, "step": 98 }, { "epoch": 0.00625, "grad_norm": 14.75, "grad_norm_var": 5.29375, "learning_rate": 0.0001, "loss": 17.4062, "loss/crossentropy": 2.7459075450897217, "loss/hidden": 6.671875, "loss/jsd": 0.0, "loss/logits": 0.8454415798187256, "step": 100 }, { "epoch": 0.006375, "grad_norm": 12.9375, "grad_norm_var": 4.713655598958334, "learning_rate": 0.0001, "loss": 16.8337, "loss/crossentropy": 2.598941206932068, "loss/hidden": 6.75, "loss/jsd": 0.0, "loss/logits": 0.7346194386482239, "step": 102 }, { "epoch": 0.0065, "grad_norm": 11.8125, "grad_norm_var": 4.705192057291667, "learning_rate": 0.0001, "loss": 16.8817, "loss/crossentropy": 2.864794969558716, "loss/hidden": 6.46875, "loss/jsd": 0.0, "loss/logits": 0.758391797542572, "step": 104 }, { "epoch": 0.006625, "grad_norm": 13.875, "grad_norm_var": 5.592431640625, "learning_rate": 0.0001, "loss": 16.2693, "loss/crossentropy": 2.505234479904175, "loss/hidden": 6.375, "loss/jsd": 0.0, "loss/logits": 0.6830147504806519, "step": 106 }, { "epoch": 0.00675, "grad_norm": 10.375, "grad_norm_var": 6.703059895833333, "learning_rate": 0.0001, "loss": 16.2695, "loss/crossentropy": 2.5332175493240356, "loss/hidden": 6.390625, "loss/jsd": 0.0, "loss/logits": 0.6794486939907074, "step": 108 }, { "epoch": 0.006875, "grad_norm": 12.9375, "grad_norm_var": 5.280452473958333, "learning_rate": 0.0001, "loss": 16.2777, "loss/crossentropy": 2.5483860969543457, "loss/hidden": 6.375, "loss/jsd": 0.0, "loss/logits": 0.7257988452911377, "step": 110 }, { "epoch": 0.007, "grad_norm": 10.0, "grad_norm_var": 4.361442057291667, "learning_rate": 0.0001, "loss": 16.4055, "loss/crossentropy": 2.7951642274856567, "loss/hidden": 6.3125, "loss/jsd": 0.0, "loss/logits": 0.6555584371089935, "step": 112 }, { "epoch": 0.007125, "grad_norm": 12.1875, "grad_norm_var": 5.310791015625, "learning_rate": 0.0001, "loss": 15.8454, "loss/crossentropy": 2.729629158973694, "loss/hidden": 6.375, "loss/jsd": 0.0, "loss/logits": 0.6842410564422607, "step": 114 }, { "epoch": 0.00725, "grad_norm": 13.125, "grad_norm_var": 3.295768229166667, "learning_rate": 0.0001, "loss": 16.0073, "loss/crossentropy": 2.483906865119934, "loss/hidden": 6.453125, "loss/jsd": 0.0, "loss/logits": 0.7329719960689545, "step": 116 }, { "epoch": 0.007375, "grad_norm": 10.6875, "grad_norm_var": 3.2458170572916667, "learning_rate": 0.0001, "loss": 15.7562, "loss/crossentropy": 2.414050340652466, "loss/hidden": 6.296875, "loss/jsd": 0.0, "loss/logits": 0.6825916767120361, "step": 118 }, { "epoch": 0.0075, "grad_norm": 14.8125, "grad_norm_var": 2.760872395833333, "learning_rate": 0.0001, "loss": 16.2831, "loss/crossentropy": 2.6738442182540894, "loss/hidden": 6.28125, "loss/jsd": 0.0, "loss/logits": 0.7312073111534119, "step": 120 }, { "epoch": 0.007625, "grad_norm": 24.625, "grad_norm_var": 15.019270833333334, "learning_rate": 0.0001, "loss": 15.6956, "loss/crossentropy": 2.501539945602417, "loss/hidden": 6.03125, "loss/jsd": 0.0, "loss/logits": 0.668183445930481, "step": 122 }, { "epoch": 0.00775, "grad_norm": 48.5, "grad_norm_var": 92.15467122395833, "learning_rate": 0.0001, "loss": 16.7872, "loss/crossentropy": 2.6723363399505615, "loss/hidden": 6.109375, "loss/jsd": 0.0, "loss/logits": 0.6724417805671692, "step": 124 }, { "epoch": 0.007875, "grad_norm": 9.4375, "grad_norm_var": 97.30597330729167, "learning_rate": 0.0001, "loss": 16.5174, "loss/crossentropy": 2.8790372610092163, "loss/hidden": 6.28125, "loss/jsd": 0.0, "loss/logits": 0.7065887451171875, "step": 126 }, { "epoch": 0.008, "grad_norm": 14.3125, "grad_norm_var": 94.71399739583333, "learning_rate": 0.0001, "loss": 16.0204, "loss/crossentropy": 2.552863121032715, "loss/hidden": 6.203125, "loss/jsd": 0.0, "loss/logits": 0.7075692117214203, "step": 128 }, { "epoch": 0.008125, "grad_norm": 14.125, "grad_norm_var": 92.49375, "learning_rate": 0.0001, "loss": 15.5941, "loss/crossentropy": 2.8637495040893555, "loss/hidden": 5.953125, "loss/jsd": 0.0, "loss/logits": 0.7078527808189392, "step": 130 }, { "epoch": 0.00825, "grad_norm": 18.25, "grad_norm_var": 92.94583333333334, "learning_rate": 0.0001, "loss": 15.8905, "loss/crossentropy": 2.6780364513397217, "loss/hidden": 6.296875, "loss/jsd": 0.0, "loss/logits": 0.7987034320831299, "step": 132 }, { "epoch": 0.008375, "grad_norm": 13.875, "grad_norm_var": 89.76608072916666, "learning_rate": 0.0001, "loss": 15.1033, "loss/crossentropy": 2.5902607440948486, "loss/hidden": 6.1875, "loss/jsd": 0.0, "loss/logits": 0.6669524908065796, "step": 134 }, { "epoch": 0.0085, "grad_norm": 10.4375, "grad_norm_var": 90.15182291666666, "learning_rate": 0.0001, "loss": 15.787, "loss/crossentropy": 2.6096177101135254, "loss/hidden": 6.25, "loss/jsd": 0.0, "loss/logits": 0.666314572095871, "step": 136 }, { "epoch": 0.008625, "grad_norm": 9.4375, "grad_norm_var": 88.796337890625, "learning_rate": 0.0001, "loss": 15.6164, "loss/crossentropy": 2.6824041604995728, "loss/hidden": 5.8125, "loss/jsd": 0.0, "loss/logits": 0.649236261844635, "step": 138 }, { "epoch": 0.00875, "grad_norm": 9.875, "grad_norm_var": 14.524934895833333, "learning_rate": 0.0001, "loss": 15.0082, "loss/crossentropy": 2.388152241706848, "loss/hidden": 5.84375, "loss/jsd": 0.0, "loss/logits": 0.5979329943656921, "step": 140 }, { "epoch": 0.008875, "grad_norm": 13.125, "grad_norm_var": 7.161458333333333, "learning_rate": 0.0001, "loss": 15.6953, "loss/crossentropy": 2.940942645072937, "loss/hidden": 5.8125, "loss/jsd": 0.0, "loss/logits": 0.6629509031772614, "step": 142 }, { "epoch": 0.009, "grad_norm": 11.625, "grad_norm_var": 7.346809895833333, "learning_rate": 0.0001, "loss": 15.3134, "loss/crossentropy": 2.876917243003845, "loss/hidden": 5.96875, "loss/jsd": 0.0, "loss/logits": 0.7208061516284943, "step": 144 }, { "epoch": 0.009125, "grad_norm": 11.0625, "grad_norm_var": 7.156363932291667, "learning_rate": 0.0001, "loss": 15.0271, "loss/crossentropy": 2.891162395477295, "loss/hidden": 5.8125, "loss/jsd": 0.0, "loss/logits": 0.6949471533298492, "step": 146 }, { "epoch": 0.00925, "grad_norm": 12.125, "grad_norm_var": 4.721875, "learning_rate": 0.0001, "loss": 15.2735, "loss/crossentropy": 2.5521273612976074, "loss/hidden": 5.703125, "loss/jsd": 0.0, "loss/logits": 0.6166436970233917, "step": 148 }, { "epoch": 0.009375, "grad_norm": 10.3125, "grad_norm_var": 4.923291015625, "learning_rate": 0.0001, "loss": 14.7093, "loss/crossentropy": 2.9473615884780884, "loss/hidden": 5.78125, "loss/jsd": 0.0, "loss/logits": 0.6036444008350372, "step": 150 }, { "epoch": 0.0095, "grad_norm": 10.375, "grad_norm_var": 5.0603515625, "learning_rate": 0.0001, "loss": 14.546, "loss/crossentropy": 2.633287191390991, "loss/hidden": 5.96875, "loss/jsd": 0.0, "loss/logits": 0.6578050553798676, "step": 152 }, { "epoch": 0.009625, "grad_norm": 10.1875, "grad_norm_var": 2.786181640625, "learning_rate": 0.0001, "loss": 14.2213, "loss/crossentropy": 2.5491374731063843, "loss/hidden": 5.65625, "loss/jsd": 0.0, "loss/logits": 0.5768270492553711, "step": 154 }, { "epoch": 0.00975, "grad_norm": 9.3125, "grad_norm_var": 3.5853515625, "learning_rate": 0.0001, "loss": 14.7582, "loss/crossentropy": 2.5789016485214233, "loss/hidden": 5.796875, "loss/jsd": 0.0, "loss/logits": 0.6302525699138641, "step": 156 }, { "epoch": 0.009875, "grad_norm": 13.75, "grad_norm_var": 4.246354166666666, "learning_rate": 0.0001, "loss": 15.6996, "loss/crossentropy": 2.616266369819641, "loss/hidden": 6.015625, "loss/jsd": 0.0, "loss/logits": 0.766193151473999, "step": 158 }, { "epoch": 0.01, "grad_norm": 9.6875, "grad_norm_var": 4.795166015625, "learning_rate": 0.0001, "loss": 14.4748, "loss/crossentropy": 2.5336798429489136, "loss/hidden": 5.65625, "loss/jsd": 0.0, "loss/logits": 0.569669634103775, "step": 160 }, { "epoch": 0.010125, "grad_norm": 9.25, "grad_norm_var": 4.912613932291666, "learning_rate": 0.0001, "loss": 14.1687, "loss/crossentropy": 2.669781446456909, "loss/hidden": 5.578125, "loss/jsd": 0.0, "loss/logits": 0.5986030101776123, "step": 162 }, { "epoch": 0.01025, "grad_norm": 7.28125, "grad_norm_var": 5.49693603515625, "learning_rate": 0.0001, "loss": 13.5945, "loss/crossentropy": 2.4270399808883667, "loss/hidden": 5.578125, "loss/jsd": 0.0, "loss/logits": 0.5576302111148834, "step": 164 }, { "epoch": 0.010375, "grad_norm": 7.8125, "grad_norm_var": 5.84302978515625, "learning_rate": 0.0001, "loss": 13.7014, "loss/crossentropy": 2.49991238117218, "loss/hidden": 5.515625, "loss/jsd": 0.0, "loss/logits": 0.5413426458835602, "step": 166 }, { "epoch": 0.0105, "grad_norm": 9.9375, "grad_norm_var": 6.25494384765625, "learning_rate": 0.0001, "loss": 14.0964, "loss/crossentropy": 2.603005290031433, "loss/hidden": 5.59375, "loss/jsd": 0.0, "loss/logits": 0.5457389950752258, "step": 168 }, { "epoch": 0.010625, "grad_norm": 8.6875, "grad_norm_var": 4.14781494140625, "learning_rate": 0.0001, "loss": 13.7962, "loss/crossentropy": 2.638397216796875, "loss/hidden": 5.546875, "loss/jsd": 0.0, "loss/logits": 0.5720354318618774, "step": 170 }, { "epoch": 0.01075, "grad_norm": 9.0, "grad_norm_var": 4.098140462239583, "learning_rate": 0.0001, "loss": 13.9521, "loss/crossentropy": 2.6292617321014404, "loss/hidden": 5.484375, "loss/jsd": 0.0, "loss/logits": 0.5717212557792664, "step": 172 }, { "epoch": 0.010875, "grad_norm": 8.625, "grad_norm_var": 1.6113240559895834, "learning_rate": 0.0001, "loss": 13.4986, "loss/crossentropy": 2.518304467201233, "loss/hidden": 5.46875, "loss/jsd": 0.0, "loss/logits": 0.536159098148346, "step": 174 }, { "epoch": 0.011, "grad_norm": 7.65625, "grad_norm_var": 1.6873046875, "learning_rate": 0.0001, "loss": 13.3989, "loss/crossentropy": 2.2125723361968994, "loss/hidden": 5.421875, "loss/jsd": 0.0, "loss/logits": 0.5268353521823883, "step": 176 }, { "epoch": 0.011125, "grad_norm": 12.0625, "grad_norm_var": 2.527197265625, "learning_rate": 0.0001, "loss": 13.4924, "loss/crossentropy": 2.706782341003418, "loss/hidden": 5.515625, "loss/jsd": 0.0, "loss/logits": 0.550674319267273, "step": 178 }, { "epoch": 0.01125, "grad_norm": 8.5, "grad_norm_var": 2.1162394205729167, "learning_rate": 0.0001, "loss": 14.1862, "loss/crossentropy": 2.4378350973129272, "loss/hidden": 5.4375, "loss/jsd": 0.0, "loss/logits": 0.5222770571708679, "step": 180 }, { "epoch": 0.011375, "grad_norm": 12.0625, "grad_norm_var": 2.663505045572917, "learning_rate": 0.0001, "loss": 13.7043, "loss/crossentropy": 2.354865074157715, "loss/hidden": 5.3125, "loss/jsd": 0.0, "loss/logits": 0.5132526755332947, "step": 182 }, { "epoch": 0.0115, "grad_norm": 8.75, "grad_norm_var": 2.654150390625, "learning_rate": 0.0001, "loss": 13.2297, "loss/crossentropy": 2.359380006790161, "loss/hidden": 5.359375, "loss/jsd": 0.0, "loss/logits": 0.5386352837085724, "step": 184 }, { "epoch": 0.011625, "grad_norm": 10.5625, "grad_norm_var": 10.096077473958333, "learning_rate": 0.0001, "loss": 13.756, "loss/crossentropy": 2.416779041290283, "loss/hidden": 5.453125, "loss/jsd": 0.0, "loss/logits": 0.5563015639781952, "step": 186 }, { "epoch": 0.01175, "grad_norm": 8.125, "grad_norm_var": 10.41636962890625, "learning_rate": 0.0001, "loss": 13.1918, "loss/crossentropy": 2.453311324119568, "loss/hidden": 5.421875, "loss/jsd": 0.0, "loss/logits": 0.5705090761184692, "step": 188 }, { "epoch": 0.011875, "grad_norm": 8.5625, "grad_norm_var": 10.80523681640625, "learning_rate": 0.0001, "loss": 13.2993, "loss/crossentropy": 2.511751651763916, "loss/hidden": 5.453125, "loss/jsd": 0.0, "loss/logits": 0.5740969777107239, "step": 190 }, { "epoch": 0.012, "grad_norm": 12.75, "grad_norm_var": 236.95362955729166, "learning_rate": 0.0001, "loss": 16.3669, "loss/crossentropy": 2.3405266404151917, "loss/hidden": 5.40625, "loss/jsd": 0.0, "loss/logits": 0.5241988003253937, "step": 192 }, { "epoch": 0.012125, "grad_norm": 6.8125, "grad_norm_var": 238.78566080729166, "learning_rate": 0.0001, "loss": 12.993, "loss/crossentropy": 2.4006471633911133, "loss/hidden": 5.390625, "loss/jsd": 0.0, "loss/logits": 0.5241972208023071, "step": 194 }, { "epoch": 0.01225, "grad_norm": 10.25, "grad_norm_var": 239.61656494140624, "learning_rate": 0.0001, "loss": 13.5281, "loss/crossentropy": 2.9691383838653564, "loss/hidden": 5.421875, "loss/jsd": 0.0, "loss/logits": 0.6251322627067566, "step": 196 }, { "epoch": 0.012375, "grad_norm": 6.71875, "grad_norm_var": 241.750634765625, "learning_rate": 0.0001, "loss": 13.129, "loss/crossentropy": 2.6778980493545532, "loss/hidden": 5.25, "loss/jsd": 0.0, "loss/logits": 0.5201562494039536, "step": 198 }, { "epoch": 0.0125, "grad_norm": 7.21875, "grad_norm_var": 241.38058268229167, "learning_rate": 0.0001, "loss": 13.8708, "loss/crossentropy": 2.463951349258423, "loss/hidden": 5.328125, "loss/jsd": 0.0, "loss/logits": 0.4814695119857788, "step": 200 }, { "epoch": 0.012625, "grad_norm": 7.5625, "grad_norm_var": 240.40026041666667, "learning_rate": 0.0001, "loss": 12.9584, "loss/crossentropy": 2.425334930419922, "loss/hidden": 5.578125, "loss/jsd": 0.0, "loss/logits": 0.5459253787994385, "step": 202 }, { "epoch": 0.01275, "grad_norm": 7.25, "grad_norm_var": 241.58196614583332, "learning_rate": 0.0001, "loss": 12.9405, "loss/crossentropy": 2.5089434385299683, "loss/hidden": 5.140625, "loss/jsd": 0.0, "loss/logits": 0.4824221730232239, "step": 204 }, { "epoch": 0.012875, "grad_norm": 8.375, "grad_norm_var": 239.54192708333332, "learning_rate": 0.0001, "loss": 12.9376, "loss/crossentropy": 2.2583595514297485, "loss/hidden": 5.09375, "loss/jsd": 0.0, "loss/logits": 0.514517217874527, "step": 206 }, { "epoch": 0.013, "grad_norm": 7.21875, "grad_norm_var": 1.9207967122395833, "learning_rate": 0.0001, "loss": 12.988, "loss/crossentropy": 2.2862155437469482, "loss/hidden": 5.109375, "loss/jsd": 0.0, "loss/logits": 0.48645374178886414, "step": 208 }, { "epoch": 0.013125, "grad_norm": 8.75, "grad_norm_var": 2.44664306640625, "learning_rate": 0.0001, "loss": 13.3912, "loss/crossentropy": 2.437165856361389, "loss/hidden": 5.703125, "loss/jsd": 0.0, "loss/logits": 0.5492190718650818, "step": 210 }, { "epoch": 0.01325, "grad_norm": 6.96875, "grad_norm_var": 2.31236572265625, "learning_rate": 0.0001, "loss": 12.8591, "loss/crossentropy": 2.2407296895980835, "loss/hidden": 5.234375, "loss/jsd": 0.0, "loss/logits": 0.5724562406539917, "step": 212 }, { "epoch": 0.013375, "grad_norm": 7.40625, "grad_norm_var": 2.2282389322916667, "learning_rate": 0.0001, "loss": 12.9599, "loss/crossentropy": 2.4113292694091797, "loss/hidden": 5.34375, "loss/jsd": 0.0, "loss/logits": 0.5324889719486237, "step": 214 }, { "epoch": 0.0135, "grad_norm": 10.5625, "grad_norm_var": 2.3643513997395833, "learning_rate": 0.0001, "loss": 13.1707, "loss/crossentropy": 2.7635679244995117, "loss/hidden": 5.296875, "loss/jsd": 0.0, "loss/logits": 0.6057597100734711, "step": 216 }, { "epoch": 0.013625, "grad_norm": 8.6875, "grad_norm_var": 2.3481404622395834, "learning_rate": 0.0001, "loss": 12.8911, "loss/crossentropy": 2.390444755554199, "loss/hidden": 5.34375, "loss/jsd": 0.0, "loss/logits": 0.5059748589992523, "step": 218 }, { "epoch": 0.01375, "grad_norm": 8.1875, "grad_norm_var": 2.2968098958333334, "learning_rate": 0.0001, "loss": 12.8851, "loss/crossentropy": 2.631130814552307, "loss/hidden": 5.125, "loss/jsd": 0.0, "loss/logits": 0.5047749727964401, "step": 220 }, { "epoch": 0.013875, "grad_norm": 7.03125, "grad_norm_var": 1.7372355143229166, "learning_rate": 0.0001, "loss": 12.8313, "loss/crossentropy": 2.4646248817443848, "loss/hidden": 5.046875, "loss/jsd": 0.0, "loss/logits": 0.47171956300735474, "step": 222 }, { "epoch": 0.014, "grad_norm": 6.75, "grad_norm_var": 1.8314453125, "learning_rate": 0.0001, "loss": 12.8172, "loss/crossentropy": 2.8190932273864746, "loss/hidden": 5.171875, "loss/jsd": 0.0, "loss/logits": 0.535597488284111, "step": 224 }, { "epoch": 0.014125, "grad_norm": 6.75, "grad_norm_var": 1.040087890625, "learning_rate": 0.0001, "loss": 12.6812, "loss/crossentropy": 2.2833110094070435, "loss/hidden": 5.0625, "loss/jsd": 0.0, "loss/logits": 0.4708975851535797, "step": 226 }, { "epoch": 0.01425, "grad_norm": 7.03125, "grad_norm_var": 2.4791015625, "learning_rate": 0.0001, "loss": 12.4668, "loss/crossentropy": 2.3643693923950195, "loss/hidden": 5.0, "loss/jsd": 0.0, "loss/logits": 0.4921976178884506, "step": 228 }, { "epoch": 0.014375, "grad_norm": 7.5625, "grad_norm_var": 2.474348958333333, "learning_rate": 0.0001, "loss": 12.8307, "loss/crossentropy": 2.6301621198654175, "loss/hidden": 5.21875, "loss/jsd": 0.0, "loss/logits": 0.5107888281345367, "step": 230 }, { "epoch": 0.0145, "grad_norm": 10.75, "grad_norm_var": 2.551416015625, "learning_rate": 0.0001, "loss": 13.5689, "loss/crossentropy": 2.4100793600082397, "loss/hidden": 5.3125, "loss/jsd": 0.0, "loss/logits": 0.5570683926343918, "step": 232 }, { "epoch": 0.014625, "grad_norm": 8.625, "grad_norm_var": 2.5118123372395833, "learning_rate": 0.0001, "loss": 12.8929, "loss/crossentropy": 2.2406667470932007, "loss/hidden": 5.03125, "loss/jsd": 0.0, "loss/logits": 0.4651384800672531, "step": 234 }, { "epoch": 0.01475, "grad_norm": 5.5625, "grad_norm_var": 2.9480305989583333, "learning_rate": 0.0001, "loss": 12.5237, "loss/crossentropy": 2.4660996198654175, "loss/hidden": 4.9375, "loss/jsd": 0.0, "loss/logits": 0.48331503570079803, "step": 236 }, { "epoch": 0.014875, "grad_norm": 9.6875, "grad_norm_var": 3.067822265625, "learning_rate": 0.0001, "loss": 12.5372, "loss/crossentropy": 2.574433207511902, "loss/hidden": 5.109375, "loss/jsd": 0.0, "loss/logits": 0.5056887269020081, "step": 238 }, { "epoch": 0.015, "grad_norm": 9.125, "grad_norm_var": 4.831233723958333, "learning_rate": 0.0001, "loss": 13.1554, "loss/crossentropy": 2.6438863277435303, "loss/hidden": 5.21875, "loss/jsd": 0.0, "loss/logits": 0.5189574062824249, "step": 240 }, { "epoch": 0.015125, "grad_norm": 11.875, "grad_norm_var": 5.3673828125, "learning_rate": 0.0001, "loss": 13.0263, "loss/crossentropy": 2.455929160118103, "loss/hidden": 5.046875, "loss/jsd": 0.0, "loss/logits": 0.5100338459014893, "step": 242 }, { "epoch": 0.01525, "grad_norm": 7.84375, "grad_norm_var": 4.651285807291667, "learning_rate": 0.0001, "loss": 12.6428, "loss/crossentropy": 2.188807249069214, "loss/hidden": 5.59375, "loss/jsd": 0.0, "loss/logits": 0.453563928604126, "step": 244 }, { "epoch": 0.015375, "grad_norm": 6.03125, "grad_norm_var": 5.185933430989583, "learning_rate": 0.0001, "loss": 12.2364, "loss/crossentropy": 2.2295005321502686, "loss/hidden": 5.0, "loss/jsd": 0.0, "loss/logits": 0.4845456928014755, "step": 246 }, { "epoch": 0.0155, "grad_norm": 8.8125, "grad_norm_var": 11.094136555989584, "learning_rate": 0.0001, "loss": 13.2367, "loss/crossentropy": 2.5928804874420166, "loss/hidden": 5.234375, "loss/jsd": 0.0, "loss/logits": 0.5228414535522461, "step": 248 }, { "epoch": 0.015625, "grad_norm": 7.1875, "grad_norm_var": 11.339176432291667, "learning_rate": 0.0001, "loss": 12.6461, "loss/crossentropy": 2.5913355350494385, "loss/hidden": 5.0, "loss/jsd": 0.0, "loss/logits": 0.5078196376562119, "step": 250 }, { "epoch": 0.01575, "grad_norm": 8.1875, "grad_norm_var": 10.59488525390625, "learning_rate": 0.0001, "loss": 12.1595, "loss/crossentropy": 2.487132430076599, "loss/hidden": 5.109375, "loss/jsd": 0.0, "loss/logits": 0.47851574420928955, "step": 252 }, { "epoch": 0.015875, "grad_norm": 6.34375, "grad_norm_var": 11.239449055989583, "learning_rate": 0.0001, "loss": 12.188, "loss/crossentropy": 2.2202062606811523, "loss/hidden": 5.015625, "loss/jsd": 0.0, "loss/logits": 0.4945056736469269, "step": 254 }, { "epoch": 0.016, "grad_norm": 8.4375, "grad_norm_var": 9.29049072265625, "learning_rate": 0.0001, "loss": 12.6777, "loss/crossentropy": 2.128947138786316, "loss/hidden": 4.984375, "loss/jsd": 0.0, "loss/logits": 0.4598337262868881, "step": 256 }, { "epoch": 0.016125, "grad_norm": 6.90625, "grad_norm_var": 8.3, "learning_rate": 0.0001, "loss": 11.9039, "loss/crossentropy": 2.46348774433136, "loss/hidden": 4.921875, "loss/jsd": 0.0, "loss/logits": 0.4829978346824646, "step": 258 }, { "epoch": 0.01625, "grad_norm": 6.46875, "grad_norm_var": 8.34537353515625, "learning_rate": 0.0001, "loss": 12.1409, "loss/crossentropy": 2.4603699445724487, "loss/hidden": 4.8125, "loss/jsd": 0.0, "loss/logits": 0.46140412986278534, "step": 260 }, { "epoch": 0.016375, "grad_norm": 7.875, "grad_norm_var": 8.15670166015625, "learning_rate": 0.0001, "loss": 12.2392, "loss/crossentropy": 2.6464070081710815, "loss/hidden": 4.921875, "loss/jsd": 0.0, "loss/logits": 0.484171599149704, "step": 262 }, { "epoch": 0.0165, "grad_norm": 7.65625, "grad_norm_var": 0.6227701822916667, "learning_rate": 0.0001, "loss": 12.394, "loss/crossentropy": 2.5616872310638428, "loss/hidden": 4.984375, "loss/jsd": 0.0, "loss/logits": 0.486949160695076, "step": 264 }, { "epoch": 0.016625, "grad_norm": 6.8125, "grad_norm_var": 0.6332316080729167, "learning_rate": 0.0001, "loss": 12.8669, "loss/crossentropy": 2.5383331775665283, "loss/hidden": 4.90625, "loss/jsd": 0.0, "loss/logits": 0.4964377284049988, "step": 266 }, { "epoch": 0.01675, "grad_norm": 7.40625, "grad_norm_var": 0.55953369140625, "learning_rate": 0.0001, "loss": 12.4147, "loss/crossentropy": 2.6105215549468994, "loss/hidden": 4.828125, "loss/jsd": 0.0, "loss/logits": 0.4983997344970703, "step": 268 }, { "epoch": 0.016875, "grad_norm": 6.53125, "grad_norm_var": 14.160542805989584, "learning_rate": 0.0001, "loss": 11.9439, "loss/crossentropy": 2.5301018953323364, "loss/hidden": 4.890625, "loss/jsd": 0.0, "loss/logits": 0.47874002158641815, "step": 270 }, { "epoch": 0.017, "grad_norm": 12.5, "grad_norm_var": 15.448828125, "learning_rate": 0.0001, "loss": 12.4739, "loss/crossentropy": 2.7044990062713623, "loss/hidden": 5.03125, "loss/jsd": 0.0, "loss/logits": 0.4878784120082855, "step": 272 }, { "epoch": 0.017125, "grad_norm": 5.8125, "grad_norm_var": 15.85718994140625, "learning_rate": 0.0001, "loss": 12.2921, "loss/crossentropy": 2.4374502897262573, "loss/hidden": 4.84375, "loss/jsd": 0.0, "loss/logits": 0.478779673576355, "step": 274 }, { "epoch": 0.01725, "grad_norm": 12.3125, "grad_norm_var": 16.76685791015625, "learning_rate": 0.0001, "loss": 12.6691, "loss/crossentropy": 2.630133867263794, "loss/hidden": 4.9375, "loss/jsd": 0.0, "loss/logits": 0.5231852233409882, "step": 276 }, { "epoch": 0.017375, "grad_norm": 7.5, "grad_norm_var": 17.043994140625, "learning_rate": 0.0001, "loss": 12.6767, "loss/crossentropy": 2.273444890975952, "loss/hidden": 4.828125, "loss/jsd": 0.0, "loss/logits": 0.4809526354074478, "step": 278 }, { "epoch": 0.0175, "grad_norm": 6.1875, "grad_norm_var": 17.066923014322917, "learning_rate": 0.0001, "loss": 12.678, "loss/crossentropy": 2.6627763509750366, "loss/hidden": 4.90625, "loss/jsd": 0.0, "loss/logits": 0.491416797041893, "step": 280 }, { "epoch": 0.017625, "grad_norm": 10.5, "grad_norm_var": 16.721858723958334, "learning_rate": 0.0001, "loss": 12.4923, "loss/crossentropy": 2.500901937484741, "loss/hidden": 4.859375, "loss/jsd": 0.0, "loss/logits": 0.4861127436161041, "step": 282 }, { "epoch": 0.01775, "grad_norm": 7.21875, "grad_norm_var": 16.683854166666666, "learning_rate": 0.0001, "loss": 12.1789, "loss/crossentropy": 2.5762306451797485, "loss/hidden": 4.953125, "loss/jsd": 0.0, "loss/logits": 0.4884538948535919, "step": 284 }, { "epoch": 0.017875, "grad_norm": 7.15625, "grad_norm_var": 5.259375, "learning_rate": 0.0001, "loss": 12.1434, "loss/crossentropy": 2.223027467727661, "loss/hidden": 4.984375, "loss/jsd": 0.0, "loss/logits": 0.4477083534002304, "step": 286 }, { "epoch": 0.018, "grad_norm": 7.3125, "grad_norm_var": 4.191206868489584, "learning_rate": 0.0001, "loss": 12.5347, "loss/crossentropy": 2.4200878143310547, "loss/hidden": 4.875, "loss/jsd": 0.0, "loss/logits": 0.5196676105260849, "step": 288 }, { "epoch": 0.018125, "grad_norm": 8.375, "grad_norm_var": 3.7400390625, "learning_rate": 0.0001, "loss": 12.3831, "loss/crossentropy": 2.7823903560638428, "loss/hidden": 4.8125, "loss/jsd": 0.0, "loss/logits": 0.5086425989866257, "step": 290 }, { "epoch": 0.01825, "grad_norm": 6.4375, "grad_norm_var": 2.6685831705729166, "learning_rate": 0.0001, "loss": 12.2231, "loss/crossentropy": 2.489367365837097, "loss/hidden": 4.828125, "loss/jsd": 0.0, "loss/logits": 0.5062971413135529, "step": 292 }, { "epoch": 0.018375, "grad_norm": 6.96875, "grad_norm_var": 1.9553385416666667, "learning_rate": 0.0001, "loss": 11.649, "loss/crossentropy": 2.4984132051467896, "loss/hidden": 4.953125, "loss/jsd": 0.0, "loss/logits": 0.49215397238731384, "step": 294 }, { "epoch": 0.0185, "grad_norm": 6.9375, "grad_norm_var": 1.90260009765625, "learning_rate": 0.0001, "loss": 11.543, "loss/crossentropy": 2.216195821762085, "loss/hidden": 4.796875, "loss/jsd": 0.0, "loss/logits": 0.4642379879951477, "step": 296 }, { "epoch": 0.018625, "grad_norm": 5.9375, "grad_norm_var": 0.85386962890625, "learning_rate": 0.0001, "loss": 11.6498, "loss/crossentropy": 2.4640315771102905, "loss/hidden": 4.765625, "loss/jsd": 0.0, "loss/logits": 0.4519060105085373, "step": 298 }, { "epoch": 0.01875, "grad_norm": 7.5, "grad_norm_var": 1.0302734375, "learning_rate": 0.0001, "loss": 12.0896, "loss/crossentropy": 2.3119007349014282, "loss/hidden": 4.75, "loss/jsd": 0.0, "loss/logits": 0.4299873411655426, "step": 300 }, { "epoch": 0.018875, "grad_norm": 5.78125, "grad_norm_var": 1.1501302083333333, "learning_rate": 0.0001, "loss": 11.8497, "loss/crossentropy": 2.7904053926467896, "loss/hidden": 5.125, "loss/jsd": 0.0, "loss/logits": 0.4920203685760498, "step": 302 }, { "epoch": 0.019, "grad_norm": 6.6875, "grad_norm_var": 0.5406534830729167, "learning_rate": 0.0001, "loss": 12.3992, "loss/crossentropy": 2.4530467987060547, "loss/hidden": 4.78125, "loss/jsd": 0.0, "loss/logits": 0.500371515750885, "step": 304 }, { "epoch": 0.019125, "grad_norm": 5.15625, "grad_norm_var": 0.48980712890625, "learning_rate": 0.0001, "loss": 11.2638, "loss/crossentropy": 2.1718756556510925, "loss/hidden": 4.703125, "loss/jsd": 0.0, "loss/logits": 0.3996942490339279, "step": 306 }, { "epoch": 0.01925, "grad_norm": 6.40625, "grad_norm_var": 0.6926717122395833, "learning_rate": 0.0001, "loss": 11.7128, "loss/crossentropy": 2.279876947402954, "loss/hidden": 4.734375, "loss/jsd": 0.0, "loss/logits": 0.44014112651348114, "step": 308 }, { "epoch": 0.019375, "grad_norm": 6.625, "grad_norm_var": 0.9011555989583333, "learning_rate": 0.0001, "loss": 11.7345, "loss/crossentropy": 2.3985207080841064, "loss/hidden": 4.671875, "loss/jsd": 0.0, "loss/logits": 0.447705939412117, "step": 310 }, { "epoch": 0.0195, "grad_norm": 7.21875, "grad_norm_var": 0.9461588541666667, "learning_rate": 0.0001, "loss": 11.8325, "loss/crossentropy": 2.3608609437942505, "loss/hidden": 4.71875, "loss/jsd": 0.0, "loss/logits": 0.47082260251045227, "step": 312 }, { "epoch": 0.019625, "grad_norm": 5.96875, "grad_norm_var": 0.9327962239583333, "learning_rate": 0.0001, "loss": 12.1212, "loss/crossentropy": 2.4431068897247314, "loss/hidden": 4.8125, "loss/jsd": 0.0, "loss/logits": 0.46555808186531067, "step": 314 }, { "epoch": 0.01975, "grad_norm": 7.40625, "grad_norm_var": 0.8569661458333333, "learning_rate": 0.0001, "loss": 11.4994, "loss/crossentropy": 2.4739853143692017, "loss/hidden": 4.6875, "loss/jsd": 0.0, "loss/logits": 0.43313735723495483, "step": 316 }, { "epoch": 0.019875, "grad_norm": 5.75, "grad_norm_var": 0.84625244140625, "learning_rate": 0.0001, "loss": 12.1461, "loss/crossentropy": 2.4454002380371094, "loss/hidden": 4.703125, "loss/jsd": 0.0, "loss/logits": 0.5066866874694824, "step": 318 }, { "epoch": 0.02, "grad_norm": 6.375, "grad_norm_var": 0.873046875, "learning_rate": 0.0001, "loss": 11.8567, "loss/crossentropy": 2.3192704916000366, "loss/hidden": 4.890625, "loss/jsd": 0.0, "loss/logits": 0.4636172652244568, "step": 320 }, { "epoch": 0.020125, "grad_norm": 5.40625, "grad_norm_var": 0.75709228515625, "learning_rate": 0.0001, "loss": 11.48, "loss/crossentropy": 2.4527477025985718, "loss/hidden": 4.625, "loss/jsd": 0.0, "loss/logits": 0.4178178161382675, "step": 322 }, { "epoch": 0.02025, "grad_norm": 5.96875, "grad_norm_var": 0.6018839518229167, "learning_rate": 0.0001, "loss": 11.4718, "loss/crossentropy": 2.553478956222534, "loss/hidden": 4.796875, "loss/jsd": 0.0, "loss/logits": 0.45829740166664124, "step": 324 }, { "epoch": 0.020375, "grad_norm": 5.9375, "grad_norm_var": 0.42291666666666666, "learning_rate": 0.0001, "loss": 11.8038, "loss/crossentropy": 2.6207300424575806, "loss/hidden": 4.75, "loss/jsd": 0.0, "loss/logits": 0.4922682046890259, "step": 326 }, { "epoch": 0.0205, "grad_norm": 5.34375, "grad_norm_var": 0.4051920572916667, "learning_rate": 0.0001, "loss": 11.7729, "loss/crossentropy": 2.642130732536316, "loss/hidden": 4.640625, "loss/jsd": 0.0, "loss/logits": 0.46317173540592194, "step": 328 }, { "epoch": 0.020625, "grad_norm": 6.28125, "grad_norm_var": 0.38800455729166666, "learning_rate": 0.0001, "loss": 11.2394, "loss/crossentropy": 2.2937039136886597, "loss/hidden": 4.625, "loss/jsd": 0.0, "loss/logits": 0.3973730653524399, "step": 330 }, { "epoch": 0.02075, "grad_norm": 5.71875, "grad_norm_var": 0.27858072916666665, "learning_rate": 0.0001, "loss": 11.7345, "loss/crossentropy": 2.193784713745117, "loss/hidden": 4.640625, "loss/jsd": 0.0, "loss/logits": 0.4304462671279907, "step": 332 }, { "epoch": 0.020875, "grad_norm": 29.75, "grad_norm_var": 35.530985514322914, "learning_rate": 0.0001, "loss": 12.0291, "loss/crossentropy": 2.30152690410614, "loss/hidden": 5.203125, "loss/jsd": 0.0, "loss/logits": 0.5919564962387085, "step": 334 }, { "epoch": 0.021, "grad_norm": 7.5, "grad_norm_var": 35.471577962239586, "learning_rate": 0.0001, "loss": 12.0973, "loss/crossentropy": 2.5532331466674805, "loss/hidden": 4.65625, "loss/jsd": 0.0, "loss/logits": 0.48507988452911377, "step": 336 }, { "epoch": 0.021125, "grad_norm": 5.5625, "grad_norm_var": 35.56568603515625, "learning_rate": 0.0001, "loss": 11.4957, "loss/crossentropy": 2.365515947341919, "loss/hidden": 4.71875, "loss/jsd": 0.0, "loss/logits": 0.44307631254196167, "step": 338 }, { "epoch": 0.02125, "grad_norm": 6.40625, "grad_norm_var": 35.37860921223958, "learning_rate": 0.0001, "loss": 11.8233, "loss/crossentropy": 2.4251906871795654, "loss/hidden": 4.765625, "loss/jsd": 0.0, "loss/logits": 0.49562764167785645, "step": 340 }, { "epoch": 0.021375, "grad_norm": 7.34375, "grad_norm_var": 35.406376139322916, "learning_rate": 0.0001, "loss": 11.7303, "loss/crossentropy": 2.110554575920105, "loss/hidden": 4.828125, "loss/jsd": 0.0, "loss/logits": 0.4632774740457535, "step": 342 }, { "epoch": 0.0215, "grad_norm": 5.53125, "grad_norm_var": 35.112223307291664, "learning_rate": 0.0001, "loss": 11.6951, "loss/crossentropy": 2.600019693374634, "loss/hidden": 4.71875, "loss/jsd": 0.0, "loss/logits": 0.43400245904922485, "step": 344 }, { "epoch": 0.021625, "grad_norm": 6.375, "grad_norm_var": 34.9974609375, "learning_rate": 0.0001, "loss": 11.4975, "loss/crossentropy": 2.1800973415374756, "loss/hidden": 4.53125, "loss/jsd": 0.0, "loss/logits": 0.4744419902563095, "step": 346 }, { "epoch": 0.02175, "grad_norm": 5.28125, "grad_norm_var": 35.17460530598958, "learning_rate": 0.0001, "loss": 11.6123, "loss/crossentropy": 2.360541343688965, "loss/hidden": 4.625, "loss/jsd": 0.0, "loss/logits": 0.4603952467441559, "step": 348 }, { "epoch": 0.021875, "grad_norm": 6.59375, "grad_norm_var": 0.9956868489583334, "learning_rate": 0.0001, "loss": 11.4711, "loss/crossentropy": 2.4335131645202637, "loss/hidden": 4.640625, "loss/jsd": 0.0, "loss/logits": 0.45272204279899597, "step": 350 }, { "epoch": 0.022, "grad_norm": 6.5625, "grad_norm_var": 0.276025390625, "learning_rate": 0.0001, "loss": 11.2538, "loss/crossentropy": 2.328977942466736, "loss/hidden": 4.59375, "loss/jsd": 0.0, "loss/logits": 0.41704584658145905, "step": 352 }, { "epoch": 0.022125, "grad_norm": 5.75, "grad_norm_var": 0.26597900390625, "learning_rate": 0.0001, "loss": 11.6315, "loss/crossentropy": 2.308586835861206, "loss/hidden": 4.5625, "loss/jsd": 0.0, "loss/logits": 0.4296947717666626, "step": 354 }, { "epoch": 0.02225, "grad_norm": 5.875, "grad_norm_var": 0.34490559895833334, "learning_rate": 0.0001, "loss": 11.3326, "loss/crossentropy": 2.6998358964920044, "loss/hidden": 4.53125, "loss/jsd": 0.0, "loss/logits": 0.4154685437679291, "step": 356 }, { "epoch": 0.022375, "grad_norm": 5.21875, "grad_norm_var": 0.24986572265625, "learning_rate": 0.0001, "loss": 11.3176, "loss/crossentropy": 2.456470251083374, "loss/hidden": 4.421875, "loss/jsd": 0.0, "loss/logits": 0.4047754108905792, "step": 358 }, { "epoch": 0.0225, "grad_norm": 5.46875, "grad_norm_var": 0.31129150390625, "learning_rate": 0.0001, "loss": 11.3443, "loss/crossentropy": 2.438138961791992, "loss/hidden": 4.5625, "loss/jsd": 0.0, "loss/logits": 0.43710489571094513, "step": 360 }, { "epoch": 0.022625, "grad_norm": 7.34375, "grad_norm_var": 2.432421875, "learning_rate": 0.0001, "loss": 11.7096, "loss/crossentropy": 2.28983998298645, "loss/hidden": 4.625, "loss/jsd": 0.0, "loss/logits": 0.4086810499429703, "step": 362 }, { "epoch": 0.02275, "grad_norm": 5.0625, "grad_norm_var": 2.45611572265625, "learning_rate": 0.0001, "loss": 10.8147, "loss/crossentropy": 2.0908021330833435, "loss/hidden": 4.75, "loss/jsd": 0.0, "loss/logits": 0.4123135209083557, "step": 364 }, { "epoch": 0.022875, "grad_norm": 7.4375, "grad_norm_var": 2.6048828125, "learning_rate": 0.0001, "loss": 11.3904, "loss/crossentropy": 2.5523881912231445, "loss/hidden": 4.65625, "loss/jsd": 0.0, "loss/logits": 0.4920663833618164, "step": 366 }, { "epoch": 0.023, "grad_norm": 6.625, "grad_norm_var": 2.6137369791666667, "learning_rate": 0.0001, "loss": 11.7594, "loss/crossentropy": 2.5521355867385864, "loss/hidden": 4.59375, "loss/jsd": 0.0, "loss/logits": 0.44777612388134, "step": 368 }, { "epoch": 0.023125, "grad_norm": 5.4375, "grad_norm_var": 2.642899576822917, "learning_rate": 0.0001, "loss": 11.8364, "loss/crossentropy": 2.58060622215271, "loss/hidden": 4.59375, "loss/jsd": 0.0, "loss/logits": 0.43911296129226685, "step": 370 }, { "epoch": 0.02325, "grad_norm": 4.84375, "grad_norm_var": 2.650321451822917, "learning_rate": 0.0001, "loss": 11.3824, "loss/crossentropy": 2.486725926399231, "loss/hidden": 4.6875, "loss/jsd": 0.0, "loss/logits": 0.4330158978700638, "step": 372 }, { "epoch": 0.023375, "grad_norm": 6.34375, "grad_norm_var": 2.626298014322917, "learning_rate": 0.0001, "loss": 11.2746, "loss/crossentropy": 2.414612293243408, "loss/hidden": 4.640625, "loss/jsd": 0.0, "loss/logits": 0.4013929069042206, "step": 374 }, { "epoch": 0.0235, "grad_norm": 5.75, "grad_norm_var": 2.6128214518229167, "learning_rate": 0.0001, "loss": 11.1611, "loss/crossentropy": 2.392987370491028, "loss/hidden": 4.53125, "loss/jsd": 0.0, "loss/logits": 0.40711142122745514, "step": 376 }, { "epoch": 0.023625, "grad_norm": 5.75, "grad_norm_var": 0.43075764973958336, "learning_rate": 0.0001, "loss": 11.2119, "loss/crossentropy": 2.3899725675582886, "loss/hidden": 4.5, "loss/jsd": 0.0, "loss/logits": 0.43097464740276337, "step": 378 }, { "epoch": 0.02375, "grad_norm": 5.25, "grad_norm_var": 0.42994384765625, "learning_rate": 0.0001, "loss": 10.7499, "loss/crossentropy": 2.2993377447128296, "loss/hidden": 4.4375, "loss/jsd": 0.0, "loss/logits": 0.43513306975364685, "step": 380 }, { "epoch": 0.023875, "grad_norm": 5.9375, "grad_norm_var": 0.219384765625, "learning_rate": 0.0001, "loss": 11.2314, "loss/crossentropy": 2.514026165008545, "loss/hidden": 4.546875, "loss/jsd": 0.0, "loss/logits": 0.45934994518756866, "step": 382 }, { "epoch": 0.024, "grad_norm": 5.1875, "grad_norm_var": 0.16516927083333333, "learning_rate": 0.0001, "loss": 10.9743, "loss/crossentropy": 2.439473509788513, "loss/hidden": 4.4375, "loss/jsd": 0.0, "loss/logits": 0.41348697245121, "step": 384 }, { "epoch": 0.024125, "grad_norm": 5.15625, "grad_norm_var": 0.21248372395833334, "learning_rate": 0.0001, "loss": 11.0633, "loss/crossentropy": 2.1954495906829834, "loss/hidden": 4.640625, "loss/jsd": 0.0, "loss/logits": 0.40005022287368774, "step": 386 }, { "epoch": 0.02425, "grad_norm": 5.0, "grad_norm_var": 0.2565714518229167, "learning_rate": 0.0001, "loss": 10.9905, "loss/crossentropy": 2.3529566526412964, "loss/hidden": 4.53125, "loss/jsd": 0.0, "loss/logits": 0.4222729951143265, "step": 388 }, { "epoch": 0.024375, "grad_norm": 6.75, "grad_norm_var": 0.2975870768229167, "learning_rate": 0.0001, "loss": 11.3002, "loss/crossentropy": 2.187886118888855, "loss/hidden": 4.546875, "loss/jsd": 0.0, "loss/logits": 0.4269304871559143, "step": 390 }, { "epoch": 0.0245, "grad_norm": 5.59375, "grad_norm_var": 0.32405192057291665, "learning_rate": 0.0001, "loss": 10.93, "loss/crossentropy": 2.1569260358810425, "loss/hidden": 4.546875, "loss/jsd": 0.0, "loss/logits": 0.43037480115890503, "step": 392 }, { "epoch": 0.024625, "grad_norm": 7.34375, "grad_norm_var": 2.1139933268229165, "learning_rate": 0.0001, "loss": 11.8683, "loss/crossentropy": 2.4687070846557617, "loss/hidden": 4.578125, "loss/jsd": 0.0, "loss/logits": 0.4708428680896759, "step": 394 }, { "epoch": 0.02475, "grad_norm": 5.15625, "grad_norm_var": 2.086844889322917, "learning_rate": 0.0001, "loss": 11.1594, "loss/crossentropy": 2.117295980453491, "loss/hidden": 4.421875, "loss/jsd": 0.0, "loss/logits": 0.3980236202478409, "step": 396 }, { "epoch": 0.024875, "grad_norm": 7.0, "grad_norm_var": 2.18726806640625, "learning_rate": 0.0001, "loss": 10.9698, "loss/crossentropy": 2.248118758201599, "loss/hidden": 4.546875, "loss/jsd": 0.0, "loss/logits": 0.38532689213752747, "step": 398 }, { "epoch": 0.025, "grad_norm": 5.25, "grad_norm_var": 2.179427083333333, "learning_rate": 0.0001, "loss": 11.2869, "loss/crossentropy": 2.372201681137085, "loss/hidden": 4.546875, "loss/jsd": 0.0, "loss/logits": 0.5027825832366943, "step": 400 }, { "epoch": 0.025125, "grad_norm": 5.46875, "grad_norm_var": 2.1438639322916666, "learning_rate": 0.0001, "loss": 11.4489, "loss/crossentropy": 2.6094177961349487, "loss/hidden": 4.46875, "loss/jsd": 0.0, "loss/logits": 0.44682902097702026, "step": 402 }, { "epoch": 0.02525, "grad_norm": 5.6875, "grad_norm_var": 1.952978515625, "learning_rate": 0.0001, "loss": 11.1566, "loss/crossentropy": 2.584525942802429, "loss/hidden": 4.515625, "loss/jsd": 0.0, "loss/logits": 0.42920687794685364, "step": 404 }, { "epoch": 0.025375, "grad_norm": 5.625, "grad_norm_var": 2.14293212890625, "learning_rate": 0.0001, "loss": 11.5845, "loss/crossentropy": 2.393661141395569, "loss/hidden": 4.609375, "loss/jsd": 0.0, "loss/logits": 0.49287113547325134, "step": 406 }, { "epoch": 0.0255, "grad_norm": 5.125, "grad_norm_var": 2.09609375, "learning_rate": 0.0001, "loss": 11.3209, "loss/crossentropy": 2.3131325244903564, "loss/hidden": 4.484375, "loss/jsd": 0.0, "loss/logits": 0.46365103125572205, "step": 408 }, { "epoch": 0.025625, "grad_norm": 7.6875, "grad_norm_var": 0.7451131184895833, "learning_rate": 0.0001, "loss": 11.0967, "loss/crossentropy": 2.297692894935608, "loss/hidden": 4.375, "loss/jsd": 0.0, "loss/logits": 0.3878119885921478, "step": 410 }, { "epoch": 0.02575, "grad_norm": 4.8125, "grad_norm_var": 0.790087890625, "learning_rate": 0.0001, "loss": 10.9897, "loss/crossentropy": 2.6333160400390625, "loss/hidden": 4.40625, "loss/jsd": 0.0, "loss/logits": 0.42297032475471497, "step": 412 }, { "epoch": 0.025875, "grad_norm": 5.28125, "grad_norm_var": 0.6866536458333333, "learning_rate": 0.0001, "loss": 11.1158, "loss/crossentropy": 2.4412989616394043, "loss/hidden": 4.484375, "loss/jsd": 0.0, "loss/logits": 0.4481000304222107, "step": 414 }, { "epoch": 0.026, "grad_norm": 4.65625, "grad_norm_var": 0.77135009765625, "learning_rate": 0.0001, "loss": 10.9272, "loss/crossentropy": 2.398549199104309, "loss/hidden": 4.578125, "loss/jsd": 0.0, "loss/logits": 0.45178262889385223, "step": 416 }, { "epoch": 0.026125, "grad_norm": 5.25, "grad_norm_var": 0.8602701822916666, "learning_rate": 0.0001, "loss": 10.8408, "loss/crossentropy": 2.4487680196762085, "loss/hidden": 4.296875, "loss/jsd": 0.0, "loss/logits": 0.37118175625801086, "step": 418 }, { "epoch": 0.02625, "grad_norm": 5.40625, "grad_norm_var": 0.8687337239583334, "learning_rate": 0.0001, "loss": 10.8116, "loss/crossentropy": 2.2272965908050537, "loss/hidden": 4.5625, "loss/jsd": 0.0, "loss/logits": 0.4185207635164261, "step": 420 }, { "epoch": 0.026375, "grad_norm": 5.1875, "grad_norm_var": 0.51510009765625, "learning_rate": 0.0001, "loss": 10.7476, "loss/crossentropy": 2.3643970489501953, "loss/hidden": 4.359375, "loss/jsd": 0.0, "loss/logits": 0.37429845333099365, "step": 422 }, { "epoch": 0.0265, "grad_norm": 4.96875, "grad_norm_var": 0.5231770833333333, "learning_rate": 0.0001, "loss": 10.6487, "loss/crossentropy": 2.3298404216766357, "loss/hidden": 4.328125, "loss/jsd": 0.0, "loss/logits": 0.397366538643837, "step": 424 }, { "epoch": 0.026625, "grad_norm": 8.1875, "grad_norm_var": 0.6913045247395834, "learning_rate": 0.0001, "loss": 10.711, "loss/crossentropy": 2.288890838623047, "loss/hidden": 4.375, "loss/jsd": 0.0, "loss/logits": 0.40261921286582947, "step": 426 }, { "epoch": 0.02675, "grad_norm": 5.40625, "grad_norm_var": 0.6659505208333333, "learning_rate": 0.0001, "loss": 11.2139, "loss/crossentropy": 2.2884762287139893, "loss/hidden": 4.515625, "loss/jsd": 0.0, "loss/logits": 0.4873107075691223, "step": 428 }, { "epoch": 0.026875, "grad_norm": 5.9375, "grad_norm_var": 0.666650390625, "learning_rate": 0.0001, "loss": 11.3032, "loss/crossentropy": 2.7229325771331787, "loss/hidden": 4.375, "loss/jsd": 0.0, "loss/logits": 0.42428141832351685, "step": 430 }, { "epoch": 0.027, "grad_norm": 5.15625, "grad_norm_var": 0.6202962239583333, "learning_rate": 0.0001, "loss": 10.9775, "loss/crossentropy": 2.306090831756592, "loss/hidden": 4.34375, "loss/jsd": 0.0, "loss/logits": 0.38204696774482727, "step": 432 }, { "epoch": 0.027125, "grad_norm": 12.1875, "grad_norm_var": 3.175581868489583, "learning_rate": 0.0001, "loss": 11.2721, "loss/crossentropy": 2.2174941301345825, "loss/hidden": 4.390625, "loss/jsd": 0.0, "loss/logits": 0.3863690495491028, "step": 434 }, { "epoch": 0.02725, "grad_norm": 5.6875, "grad_norm_var": 3.265327962239583, "learning_rate": 0.0001, "loss": 11.42, "loss/crossentropy": 2.6020652055740356, "loss/hidden": 4.46875, "loss/jsd": 0.0, "loss/logits": 0.42650310695171356, "step": 436 }, { "epoch": 0.027375, "grad_norm": 5.75, "grad_norm_var": 3.2246053059895834, "learning_rate": 0.0001, "loss": 11.062, "loss/crossentropy": 2.5146955251693726, "loss/hidden": 4.40625, "loss/jsd": 0.0, "loss/logits": 0.4270920157432556, "step": 438 }, { "epoch": 0.0275, "grad_norm": 6.125, "grad_norm_var": 3.0216105143229166, "learning_rate": 0.0001, "loss": 10.8782, "loss/crossentropy": 2.47203266620636, "loss/hidden": 4.453125, "loss/jsd": 0.0, "loss/logits": 0.42113421857357025, "step": 440 }, { "epoch": 0.027625, "grad_norm": 6.03125, "grad_norm_var": 2.8958170572916666, "learning_rate": 0.0001, "loss": 10.9403, "loss/crossentropy": 2.349884510040283, "loss/hidden": 4.34375, "loss/jsd": 0.0, "loss/logits": 0.41682717204093933, "step": 442 }, { "epoch": 0.02775, "grad_norm": 4.59375, "grad_norm_var": 3.076888020833333, "learning_rate": 0.0001, "loss": 10.4797, "loss/crossentropy": 2.092501401901245, "loss/hidden": 4.265625, "loss/jsd": 0.0, "loss/logits": 0.363692969083786, "step": 444 }, { "epoch": 0.027875, "grad_norm": 7.0, "grad_norm_var": 3.0968587239583334, "learning_rate": 0.0001, "loss": 11.1636, "loss/crossentropy": 2.487968325614929, "loss/hidden": 4.296875, "loss/jsd": 0.0, "loss/logits": 0.4445933848619461, "step": 446 }, { "epoch": 0.028, "grad_norm": 5.15625, "grad_norm_var": 3.223465983072917, "learning_rate": 0.0001, "loss": 10.6807, "loss/crossentropy": 2.018888473510742, "loss/hidden": 4.21875, "loss/jsd": 0.0, "loss/logits": 0.3722313791513443, "step": 448 }, { "epoch": 0.028125, "grad_norm": 8.0, "grad_norm_var": 0.9498331705729167, "learning_rate": 0.0001, "loss": 11.0866, "loss/crossentropy": 2.4356439113616943, "loss/hidden": 4.359375, "loss/jsd": 0.0, "loss/logits": 0.3904484063386917, "step": 450 }, { "epoch": 0.02825, "grad_norm": 5.6875, "grad_norm_var": 0.764306640625, "learning_rate": 0.0001, "loss": 11.0359, "loss/crossentropy": 2.2291282415390015, "loss/hidden": 4.734375, "loss/jsd": 0.0, "loss/logits": 0.4464022219181061, "step": 452 }, { "epoch": 0.028375, "grad_norm": 4.40625, "grad_norm_var": 0.88013916015625, "learning_rate": 0.0001, "loss": 10.6204, "loss/crossentropy": 2.1905126571655273, "loss/hidden": 4.25, "loss/jsd": 0.0, "loss/logits": 0.3691885471343994, "step": 454 }, { "epoch": 0.0285, "grad_norm": 4.53125, "grad_norm_var": 0.87115478515625, "learning_rate": 0.0001, "loss": 10.7509, "loss/crossentropy": 2.4307353496551514, "loss/hidden": 4.328125, "loss/jsd": 0.0, "loss/logits": 0.3945564180612564, "step": 456 }, { "epoch": 0.028625, "grad_norm": 4.6875, "grad_norm_var": 0.8925618489583333, "learning_rate": 0.0001, "loss": 10.3901, "loss/crossentropy": 2.26895272731781, "loss/hidden": 4.28125, "loss/jsd": 0.0, "loss/logits": 0.4020170420408249, "step": 458 }, { "epoch": 0.02875, "grad_norm": 6.03125, "grad_norm_var": 0.8680989583333333, "learning_rate": 0.0001, "loss": 10.8526, "loss/crossentropy": 2.686508536338806, "loss/hidden": 4.328125, "loss/jsd": 0.0, "loss/logits": 0.39317409694194794, "step": 460 }, { "epoch": 0.028875, "grad_norm": 4.71875, "grad_norm_var": 0.7363118489583333, "learning_rate": 0.0001, "loss": 10.6288, "loss/crossentropy": 2.7407588958740234, "loss/hidden": 4.296875, "loss/jsd": 0.0, "loss/logits": 0.4108298718929291, "step": 462 }, { "epoch": 0.029, "grad_norm": 5.15625, "grad_norm_var": 0.7388956705729167, "learning_rate": 0.0001, "loss": 10.6546, "loss/crossentropy": 2.4361412525177, "loss/hidden": 4.28125, "loss/jsd": 0.0, "loss/logits": 0.38222774863243103, "step": 464 }, { "epoch": 0.029125, "grad_norm": 4.90625, "grad_norm_var": 0.24388020833333332, "learning_rate": 0.0001, "loss": 10.6433, "loss/crossentropy": 2.154534697532654, "loss/hidden": 4.3125, "loss/jsd": 0.0, "loss/logits": 0.3813844919204712, "step": 466 }, { "epoch": 0.02925, "grad_norm": 5.375, "grad_norm_var": 0.20162760416666667, "learning_rate": 0.0001, "loss": 10.5563, "loss/crossentropy": 2.341228723526001, "loss/hidden": 4.4375, "loss/jsd": 0.0, "loss/logits": 0.415422186255455, "step": 468 }, { "epoch": 0.029375, "grad_norm": 4.625, "grad_norm_var": 0.24954427083333333, "learning_rate": 0.0001, "loss": 10.6267, "loss/crossentropy": 2.220223307609558, "loss/hidden": 4.375, "loss/jsd": 0.0, "loss/logits": 0.40844734013080597, "step": 470 }, { "epoch": 0.0295, "grad_norm": 5.4375, "grad_norm_var": 0.22665608723958333, "learning_rate": 0.0001, "loss": 10.7236, "loss/crossentropy": 2.2939720153808594, "loss/hidden": 4.203125, "loss/jsd": 0.0, "loss/logits": 0.41516372561454773, "step": 472 }, { "epoch": 0.029625, "grad_norm": 5.53125, "grad_norm_var": 0.19724934895833332, "learning_rate": 0.0001, "loss": 10.7759, "loss/crossentropy": 2.204783082008362, "loss/hidden": 4.265625, "loss/jsd": 0.0, "loss/logits": 0.3852095752954483, "step": 474 }, { "epoch": 0.02975, "grad_norm": 4.4375, "grad_norm_var": 0.179541015625, "learning_rate": 0.0001, "loss": 10.5148, "loss/crossentropy": 2.222555994987488, "loss/hidden": 4.359375, "loss/jsd": 0.0, "loss/logits": 0.3944680392742157, "step": 476 }, { "epoch": 0.029875, "grad_norm": 5.71875, "grad_norm_var": 0.17730712890625, "learning_rate": 0.0001, "loss": 10.8371, "loss/crossentropy": 2.109412908554077, "loss/hidden": 4.59375, "loss/jsd": 0.0, "loss/logits": 0.41633202135562897, "step": 478 }, { "epoch": 0.03, "grad_norm": 5.40625, "grad_norm_var": 0.17258707682291666, "learning_rate": 0.0001, "loss": 10.4084, "loss/crossentropy": 2.3310940265655518, "loss/hidden": 4.375, "loss/jsd": 0.0, "loss/logits": 0.36518849432468414, "step": 480 }, { "epoch": 0.030125, "grad_norm": 5.90625, "grad_norm_var": 0.20506184895833332, "learning_rate": 0.0001, "loss": 11.0288, "loss/crossentropy": 2.5294687747955322, "loss/hidden": 4.296875, "loss/jsd": 0.0, "loss/logits": 0.4224875122308731, "step": 482 }, { "epoch": 0.03025, "grad_norm": 5.34375, "grad_norm_var": 0.20452067057291667, "learning_rate": 0.0001, "loss": 10.427, "loss/crossentropy": 2.448809266090393, "loss/hidden": 4.234375, "loss/jsd": 0.0, "loss/logits": 0.37274371087551117, "step": 484 }, { "epoch": 0.030375, "grad_norm": 15.125, "grad_norm_var": 6.214676920572916, "learning_rate": 0.0001, "loss": 10.7951, "loss/crossentropy": 2.407306671142578, "loss/hidden": 4.453125, "loss/jsd": 0.0, "loss/logits": 0.4623569846153259, "step": 486 }, { "epoch": 0.0305, "grad_norm": 4.71875, "grad_norm_var": 6.266630045572916, "learning_rate": 0.0001, "loss": 10.6229, "loss/crossentropy": 2.184267520904541, "loss/hidden": 4.328125, "loss/jsd": 0.0, "loss/logits": 0.384739488363266, "step": 488 }, { "epoch": 0.030625, "grad_norm": 5.59375, "grad_norm_var": 6.216259765625, "learning_rate": 0.0001, "loss": 11.05, "loss/crossentropy": 2.283589720726013, "loss/hidden": 4.265625, "loss/jsd": 0.0, "loss/logits": 0.3847021609544754, "step": 490 }, { "epoch": 0.03075, "grad_norm": 5.40625, "grad_norm_var": 6.065738932291667, "learning_rate": 0.0001, "loss": 10.6189, "loss/crossentropy": 2.257542133331299, "loss/hidden": 4.265625, "loss/jsd": 0.0, "loss/logits": 0.39529381692409515, "step": 492 }, { "epoch": 0.030875, "grad_norm": 5.09375, "grad_norm_var": 6.103255208333334, "learning_rate": 0.0001, "loss": 10.7398, "loss/crossentropy": 2.58119535446167, "loss/hidden": 4.3125, "loss/jsd": 0.0, "loss/logits": 0.394280344247818, "step": 494 }, { "epoch": 0.031, "grad_norm": 13.0, "grad_norm_var": 9.049983723958333, "learning_rate": 0.0001, "loss": 10.6087, "loss/crossentropy": 2.2886345386505127, "loss/hidden": 4.234375, "loss/jsd": 0.0, "loss/logits": 0.3691941201686859, "step": 496 }, { "epoch": 0.031125, "grad_norm": 6.28125, "grad_norm_var": 8.890999348958333, "learning_rate": 0.0001, "loss": 10.7293, "loss/crossentropy": 2.303394079208374, "loss/hidden": 4.265625, "loss/jsd": 0.0, "loss/logits": 0.4062899351119995, "step": 498 }, { "epoch": 0.03125, "grad_norm": 5.5, "grad_norm_var": 8.947119140625, "learning_rate": 0.0001, "loss": 10.4362, "loss/crossentropy": 2.66743266582489, "loss/hidden": 4.21875, "loss/jsd": 0.0, "loss/logits": 0.3696695417165756, "step": 500 }, { "epoch": 0.031375, "grad_norm": 4.875, "grad_norm_var": 3.8385050455729166, "learning_rate": 0.0001, "loss": 10.3541, "loss/crossentropy": 2.4178144931793213, "loss/hidden": 4.34375, "loss/jsd": 0.0, "loss/logits": 0.42277923226356506, "step": 502 }, { "epoch": 0.0315, "grad_norm": 5.28125, "grad_norm_var": 3.7421223958333334, "learning_rate": 0.0001, "loss": 11.0164, "loss/crossentropy": 2.661468505859375, "loss/hidden": 4.515625, "loss/jsd": 0.0, "loss/logits": 0.5325552225112915, "step": 504 }, { "epoch": 0.031625, "grad_norm": 4.90625, "grad_norm_var": 3.8019368489583334, "learning_rate": 0.0001, "loss": 10.6102, "loss/crossentropy": 2.522894263267517, "loss/hidden": 4.234375, "loss/jsd": 0.0, "loss/logits": 0.4011620730161667, "step": 506 }, { "epoch": 0.03175, "grad_norm": 5.1875, "grad_norm_var": 3.8270833333333334, "learning_rate": 0.0001, "loss": 10.7316, "loss/crossentropy": 2.588345766067505, "loss/hidden": 4.28125, "loss/jsd": 0.0, "loss/logits": 0.4037622660398483, "step": 508 }, { "epoch": 0.031875, "grad_norm": 5.1875, "grad_norm_var": 3.8101521809895833, "learning_rate": 0.0001, "loss": 10.8309, "loss/crossentropy": 2.4223917722702026, "loss/hidden": 4.28125, "loss/jsd": 0.0, "loss/logits": 0.40193139016628265, "step": 510 }, { "epoch": 0.032, "grad_norm": 4.9375, "grad_norm_var": 0.22420247395833334, "learning_rate": 0.0001, "loss": 10.2842, "loss/crossentropy": 2.045061230659485, "loss/hidden": 4.265625, "loss/jsd": 0.0, "loss/logits": 0.3979046642780304, "step": 512 }, { "epoch": 0.032125, "grad_norm": 5.0, "grad_norm_var": 0.18394775390625, "learning_rate": 0.0001, "loss": 10.4619, "loss/crossentropy": 2.1112935543060303, "loss/hidden": 4.15625, "loss/jsd": 0.0, "loss/logits": 0.3597547709941864, "step": 514 }, { "epoch": 0.03225, "grad_norm": 4.75, "grad_norm_var": 0.18990885416666667, "learning_rate": 0.0001, "loss": 10.4796, "loss/crossentropy": 2.226701498031616, "loss/hidden": 4.203125, "loss/jsd": 0.0, "loss/logits": 0.4143287241458893, "step": 516 }, { "epoch": 0.032375, "grad_norm": 7.3125, "grad_norm_var": 0.47237955729166664, "learning_rate": 0.0001, "loss": 10.7233, "loss/crossentropy": 2.662535786628723, "loss/hidden": 4.34375, "loss/jsd": 0.0, "loss/logits": 0.3921970725059509, "step": 518 }, { "epoch": 0.0325, "grad_norm": 5.25, "grad_norm_var": 0.4427083333333333, "learning_rate": 0.0001, "loss": 10.5476, "loss/crossentropy": 2.2924622297286987, "loss/hidden": 4.28125, "loss/jsd": 0.0, "loss/logits": 0.41019099950790405, "step": 520 }, { "epoch": 0.032625, "grad_norm": 4.59375, "grad_norm_var": 0.4473795572916667, "learning_rate": 0.0001, "loss": 10.6009, "loss/crossentropy": 2.420462727546692, "loss/hidden": 4.328125, "loss/jsd": 0.0, "loss/logits": 0.4323180317878723, "step": 522 }, { "epoch": 0.03275, "grad_norm": 4.90625, "grad_norm_var": 0.47433268229166664, "learning_rate": 0.0001, "loss": 10.5171, "loss/crossentropy": 2.481095314025879, "loss/hidden": 4.375, "loss/jsd": 0.0, "loss/logits": 0.4323504865169525, "step": 524 }, { "epoch": 0.032875, "grad_norm": 6.25, "grad_norm_var": 3.908707682291667, "learning_rate": 0.0001, "loss": 10.7184, "loss/crossentropy": 2.2674002647399902, "loss/hidden": 4.390625, "loss/jsd": 0.0, "loss/logits": 0.4381224364042282, "step": 526 }, { "epoch": 0.033, "grad_norm": 4.75, "grad_norm_var": 3.859375, "learning_rate": 0.0001, "loss": 10.6699, "loss/crossentropy": 2.455391049385071, "loss/hidden": 4.296875, "loss/jsd": 0.0, "loss/logits": 0.36728164553642273, "step": 528 }, { "epoch": 0.033125, "grad_norm": 5.34375, "grad_norm_var": 3.804541015625, "learning_rate": 0.0001, "loss": 10.6111, "loss/crossentropy": 2.44006884098053, "loss/hidden": 4.46875, "loss/jsd": 0.0, "loss/logits": 0.4682173430919647, "step": 530 }, { "epoch": 0.03325, "grad_norm": 26.25, "grad_norm_var": 29.899853515625, "learning_rate": 0.0001, "loss": 11.2501, "loss/crossentropy": 2.378816246986389, "loss/hidden": 4.21875, "loss/jsd": 0.0, "loss/logits": 0.40724216401576996, "step": 532 }, { "epoch": 0.033375, "grad_norm": 4.875, "grad_norm_var": 30.194986979166668, "learning_rate": 0.0001, "loss": 10.8243, "loss/crossentropy": 2.361769199371338, "loss/hidden": 4.1875, "loss/jsd": 0.0, "loss/logits": 0.38719530403614044, "step": 534 }, { "epoch": 0.0335, "grad_norm": 4.59375, "grad_norm_var": 30.469559733072916, "learning_rate": 0.0001, "loss": 10.4101, "loss/crossentropy": 2.200801968574524, "loss/hidden": 4.3125, "loss/jsd": 0.0, "loss/logits": 0.3492705821990967, "step": 536 }, { "epoch": 0.033625, "grad_norm": 5.0625, "grad_norm_var": 30.48599853515625, "learning_rate": 0.0001, "loss": 10.3478, "loss/crossentropy": 2.065083146095276, "loss/hidden": 4.1875, "loss/jsd": 0.0, "loss/logits": 0.3540280908346176, "step": 538 }, { "epoch": 0.03375, "grad_norm": 4.875, "grad_norm_var": 30.33671875, "learning_rate": 0.0001, "loss": 10.6963, "loss/crossentropy": 2.5465164184570312, "loss/hidden": 4.4375, "loss/jsd": 0.0, "loss/logits": 0.44140176475048065, "step": 540 }, { "epoch": 0.033875, "grad_norm": 4.96875, "grad_norm_var": 28.386702473958334, "learning_rate": 0.0001, "loss": 10.5878, "loss/crossentropy": 2.499788999557495, "loss/hidden": 4.375, "loss/jsd": 0.0, "loss/logits": 0.43500569462776184, "step": 542 }, { "epoch": 0.034, "grad_norm": 5.15625, "grad_norm_var": 28.394136555989583, "learning_rate": 0.0001, "loss": 10.689, "loss/crossentropy": 2.4177205562591553, "loss/hidden": 4.453125, "loss/jsd": 0.0, "loss/logits": 0.35840822756290436, "step": 544 }, { "epoch": 0.034125, "grad_norm": 5.84375, "grad_norm_var": 28.418473307291666, "learning_rate": 0.0001, "loss": 10.299, "loss/crossentropy": 2.4439547061920166, "loss/hidden": 4.046875, "loss/jsd": 0.0, "loss/logits": 0.32660411298274994, "step": 546 }, { "epoch": 0.03425, "grad_norm": 5.71875, "grad_norm_var": 0.76119384765625, "learning_rate": 0.0001, "loss": 10.3815, "loss/crossentropy": 2.228819489479065, "loss/hidden": 4.171875, "loss/jsd": 0.0, "loss/logits": 0.3849843740463257, "step": 548 }, { "epoch": 0.034375, "grad_norm": 4.46875, "grad_norm_var": 0.20178629557291666, "learning_rate": 0.0001, "loss": 10.7005, "loss/crossentropy": 2.2744678258895874, "loss/hidden": 4.125, "loss/jsd": 0.0, "loss/logits": 0.35007524490356445, "step": 550 }, { "epoch": 0.0345, "grad_norm": 4.65625, "grad_norm_var": 0.20432535807291666, "learning_rate": 0.0001, "loss": 10.5667, "loss/crossentropy": 2.6473900079727173, "loss/hidden": 4.1875, "loss/jsd": 0.0, "loss/logits": 0.40475085377693176, "step": 552 }, { "epoch": 0.034625, "grad_norm": 5.0, "grad_norm_var": 0.19693603515625, "learning_rate": 0.0001, "loss": 10.2846, "loss/crossentropy": 2.212767481803894, "loss/hidden": 4.1875, "loss/jsd": 0.0, "loss/logits": 0.374579593539238, "step": 554 }, { "epoch": 0.03475, "grad_norm": 4.78125, "grad_norm_var": 0.24107666015625, "learning_rate": 0.0001, "loss": 10.1355, "loss/crossentropy": 2.3621898889541626, "loss/hidden": 4.140625, "loss/jsd": 0.0, "loss/logits": 0.36958175897598267, "step": 556 }, { "epoch": 0.034875, "grad_norm": 6.125, "grad_norm_var": 0.42678629557291664, "learning_rate": 0.0001, "loss": 10.7711, "loss/crossentropy": 2.649366617202759, "loss/hidden": 4.171875, "loss/jsd": 0.0, "loss/logits": 0.3781619817018509, "step": 558 }, { "epoch": 0.035, "grad_norm": 4.96875, "grad_norm_var": 0.41321207682291666, "learning_rate": 0.0001, "loss": 10.4759, "loss/crossentropy": 2.4452648162841797, "loss/hidden": 4.140625, "loss/jsd": 0.0, "loss/logits": 0.4071914702653885, "step": 560 }, { "epoch": 0.035125, "grad_norm": 4.59375, "grad_norm_var": 0.3868326822916667, "learning_rate": 0.0001, "loss": 10.6421, "loss/crossentropy": 2.2176308631896973, "loss/hidden": 4.34375, "loss/jsd": 0.0, "loss/logits": 0.3943832516670227, "step": 562 }, { "epoch": 0.03525, "grad_norm": 5.125, "grad_norm_var": 0.38201497395833334, "learning_rate": 0.0001, "loss": 9.9568, "loss/crossentropy": 2.2056097984313965, "loss/hidden": 4.03125, "loss/jsd": 0.0, "loss/logits": 0.36127614974975586, "step": 564 }, { "epoch": 0.035375, "grad_norm": 4.625, "grad_norm_var": 0.36392822265625, "learning_rate": 0.0001, "loss": 10.1832, "loss/crossentropy": 2.1807271242141724, "loss/hidden": 4.1875, "loss/jsd": 0.0, "loss/logits": 0.3716702163219452, "step": 566 }, { "epoch": 0.0355, "grad_norm": 4.65625, "grad_norm_var": 0.365234375, "learning_rate": 0.0001, "loss": 10.2836, "loss/crossentropy": 2.2167818546295166, "loss/hidden": 4.109375, "loss/jsd": 0.0, "loss/logits": 0.36115220189094543, "step": 568 }, { "epoch": 0.035625, "grad_norm": 6.0625, "grad_norm_var": 0.46578369140625, "learning_rate": 0.0001, "loss": 10.1765, "loss/crossentropy": 2.1413118839263916, "loss/hidden": 4.109375, "loss/jsd": 0.0, "loss/logits": 0.36932238936424255, "step": 570 }, { "epoch": 0.03575, "grad_norm": 4.5, "grad_norm_var": 0.45992431640625, "learning_rate": 0.0001, "loss": 10.205, "loss/crossentropy": 2.493895649909973, "loss/hidden": 4.171875, "loss/jsd": 0.0, "loss/logits": 0.37394069135189056, "step": 572 }, { "epoch": 0.035875, "grad_norm": 4.8125, "grad_norm_var": 0.22981363932291668, "learning_rate": 0.0001, "loss": 10.4602, "loss/crossentropy": 2.2241677045822144, "loss/hidden": 4.046875, "loss/jsd": 0.0, "loss/logits": 0.3733552098274231, "step": 574 }, { "epoch": 0.036, "grad_norm": 5.21875, "grad_norm_var": 0.23527018229166666, "learning_rate": 0.0001, "loss": 9.9859, "loss/crossentropy": 2.390074133872986, "loss/hidden": 4.171875, "loss/jsd": 0.0, "loss/logits": 0.39679403603076935, "step": 576 }, { "epoch": 0.036125, "grad_norm": 4.71875, "grad_norm_var": 0.231640625, "learning_rate": 0.0001, "loss": 10.0715, "loss/crossentropy": 2.3841971158981323, "loss/hidden": 4.171875, "loss/jsd": 0.0, "loss/logits": 0.3936457931995392, "step": 578 }, { "epoch": 0.03625, "grad_norm": 4.59375, "grad_norm_var": 0.19023030598958332, "learning_rate": 0.0001, "loss": 10.2935, "loss/crossentropy": 2.038338541984558, "loss/hidden": 4.171875, "loss/jsd": 0.0, "loss/logits": 0.361712321639061, "step": 580 }, { "epoch": 0.036375, "grad_norm": 4.03125, "grad_norm_var": 0.25833333333333336, "learning_rate": 0.0001, "loss": 9.8959, "loss/crossentropy": 2.143701195716858, "loss/hidden": 4.0234375, "loss/jsd": 0.0, "loss/logits": 0.35893675684928894, "step": 582 }, { "epoch": 0.0365, "grad_norm": 9.3125, "grad_norm_var": 1.5278483072916667, "learning_rate": 0.0001, "loss": 10.0876, "loss/crossentropy": 2.36753249168396, "loss/hidden": 4.1875, "loss/jsd": 0.0, "loss/logits": 0.44795021414756775, "step": 584 }, { "epoch": 0.036625, "grad_norm": 5.40625, "grad_norm_var": 1.4960896809895834, "learning_rate": 0.0001, "loss": 10.5745, "loss/crossentropy": 2.5945348739624023, "loss/hidden": 4.21875, "loss/jsd": 0.0, "loss/logits": 0.37108245491981506, "step": 586 }, { "epoch": 0.03675, "grad_norm": 4.375, "grad_norm_var": 1.5137858072916666, "learning_rate": 0.0001, "loss": 10.2369, "loss/crossentropy": 2.330011487007141, "loss/hidden": 4.0, "loss/jsd": 0.0, "loss/logits": 0.356001615524292, "step": 588 }, { "epoch": 0.036875, "grad_norm": 5.5625, "grad_norm_var": 1.5341105143229166, "learning_rate": 0.0001, "loss": 10.6198, "loss/crossentropy": 2.4496524333953857, "loss/hidden": 4.1875, "loss/jsd": 0.0, "loss/logits": 0.3684788942337036, "step": 590 }, { "epoch": 0.037, "grad_norm": 5.0625, "grad_norm_var": 1.5228800455729166, "learning_rate": 0.0001, "loss": 10.3633, "loss/crossentropy": 2.3266873359680176, "loss/hidden": 4.09375, "loss/jsd": 0.0, "loss/logits": 0.35862766206264496, "step": 592 }, { "epoch": 0.037125, "grad_norm": 4.96875, "grad_norm_var": 1.5251261393229167, "learning_rate": 0.0001, "loss": 10.1345, "loss/crossentropy": 2.3481708765029907, "loss/hidden": 4.140625, "loss/jsd": 0.0, "loss/logits": 0.3875860273838043, "step": 594 }, { "epoch": 0.03725, "grad_norm": 6.4375, "grad_norm_var": 1.60181884765625, "learning_rate": 0.0001, "loss": 10.4745, "loss/crossentropy": 2.4021466970443726, "loss/hidden": 4.3125, "loss/jsd": 0.0, "loss/logits": 0.41785581409931183, "step": 596 }, { "epoch": 0.037375, "grad_norm": 6.09375, "grad_norm_var": 1.4307902018229166, "learning_rate": 0.0001, "loss": 10.264, "loss/crossentropy": 2.1973708868026733, "loss/hidden": 4.109375, "loss/jsd": 0.0, "loss/logits": 0.3709661662578583, "step": 598 }, { "epoch": 0.0375, "grad_norm": 4.78125, "grad_norm_var": 0.37604166666666666, "learning_rate": 0.0001, "loss": 10.3662, "loss/crossentropy": 2.2258559465408325, "loss/hidden": 4.21875, "loss/jsd": 0.0, "loss/logits": 0.4044695645570755, "step": 600 }, { "epoch": 0.037625, "grad_norm": 5.09375, "grad_norm_var": 0.3697265625, "learning_rate": 0.0001, "loss": 10.5172, "loss/crossentropy": 2.6148056983947754, "loss/hidden": 4.328125, "loss/jsd": 0.0, "loss/logits": 0.379873663187027, "step": 602 }, { "epoch": 0.03775, "grad_norm": 4.96875, "grad_norm_var": 0.32936197916666665, "learning_rate": 0.0001, "loss": 10.7876, "loss/crossentropy": 2.396528124809265, "loss/hidden": 4.15625, "loss/jsd": 0.0, "loss/logits": 0.39409296214580536, "step": 604 }, { "epoch": 0.037875, "grad_norm": 5.46875, "grad_norm_var": 0.34065348307291665, "learning_rate": 0.0001, "loss": 10.3279, "loss/crossentropy": 2.234705090522766, "loss/hidden": 4.046875, "loss/jsd": 0.0, "loss/logits": 0.3851456344127655, "step": 606 }, { "epoch": 0.038, "grad_norm": 5.09375, "grad_norm_var": 0.3410115559895833, "learning_rate": 0.0001, "loss": 10.2649, "loss/crossentropy": 2.3476482629776, "loss/hidden": 4.3125, "loss/jsd": 0.0, "loss/logits": 0.4069204777479172, "step": 608 }, { "epoch": 0.038125, "grad_norm": 4.8125, "grad_norm_var": 0.307666015625, "learning_rate": 0.0001, "loss": 10.52, "loss/crossentropy": 2.314854621887207, "loss/hidden": 4.15625, "loss/jsd": 0.0, "loss/logits": 0.40154193341732025, "step": 610 }, { "epoch": 0.03825, "grad_norm": 5.625, "grad_norm_var": 0.253125, "learning_rate": 0.0001, "loss": 10.3055, "loss/crossentropy": 2.1955525875091553, "loss/hidden": 4.15625, "loss/jsd": 0.0, "loss/logits": 0.3835551291704178, "step": 612 }, { "epoch": 0.038375, "grad_norm": 4.1875, "grad_norm_var": 0.2865193684895833, "learning_rate": 0.0001, "loss": 10.3296, "loss/crossentropy": 2.3446608781814575, "loss/hidden": 4.109375, "loss/jsd": 0.0, "loss/logits": 0.37230314314365387, "step": 614 }, { "epoch": 0.0385, "grad_norm": 5.75, "grad_norm_var": 242.49685872395833, "learning_rate": 0.0001, "loss": 10.9956, "loss/crossentropy": 2.406968116760254, "loss/hidden": 4.0859375, "loss/jsd": 0.0, "loss/logits": 0.3518829941749573, "step": 616 }, { "epoch": 0.038625, "grad_norm": 5.125, "grad_norm_var": 243.28655192057292, "learning_rate": 0.0001, "loss": 10.228, "loss/crossentropy": 2.2349104285240173, "loss/hidden": 4.140625, "loss/jsd": 0.0, "loss/logits": 0.390815794467926, "step": 618 }, { "epoch": 0.03875, "grad_norm": 5.4375, "grad_norm_var": 243.72745768229166, "learning_rate": 0.0001, "loss": 10.3093, "loss/crossentropy": 1.9635624885559082, "loss/hidden": 4.359375, "loss/jsd": 0.0, "loss/logits": 0.3205077052116394, "step": 620 }, { "epoch": 0.038875, "grad_norm": 4.65625, "grad_norm_var": 244.0064453125, "learning_rate": 0.0001, "loss": 10.2132, "loss/crossentropy": 2.149289608001709, "loss/hidden": 4.0234375, "loss/jsd": 0.0, "loss/logits": 0.3335840404033661, "step": 622 }, { "epoch": 0.039, "grad_norm": 4.78125, "grad_norm_var": 244.57864583333333, "learning_rate": 0.0001, "loss": 10.4193, "loss/crossentropy": 2.5327214002609253, "loss/hidden": 4.203125, "loss/jsd": 0.0, "loss/logits": 0.37918125092983246, "step": 624 }, { "epoch": 0.039125, "grad_norm": 4.6875, "grad_norm_var": 245.23917643229166, "learning_rate": 0.0001, "loss": 9.9624, "loss/crossentropy": 2.457598328590393, "loss/hidden": 3.921875, "loss/jsd": 0.0, "loss/logits": 0.35679344832897186, "step": 626 }, { "epoch": 0.03925, "grad_norm": 5.59375, "grad_norm_var": 244.75501302083333, "learning_rate": 0.0001, "loss": 10.2841, "loss/crossentropy": 2.488083243370056, "loss/hidden": 4.171875, "loss/jsd": 0.0, "loss/logits": 0.4154289662837982, "step": 628 }, { "epoch": 0.039375, "grad_norm": 4.375, "grad_norm_var": 244.31011962890625, "learning_rate": 0.0001, "loss": 10.1887, "loss/crossentropy": 2.2506699562072754, "loss/hidden": 4.203125, "loss/jsd": 0.0, "loss/logits": 0.35755500197410583, "step": 630 }, { "epoch": 0.0395, "grad_norm": 4.65625, "grad_norm_var": 0.31691080729166665, "learning_rate": 0.0001, "loss": 10.1679, "loss/crossentropy": 2.3028769493103027, "loss/hidden": 4.015625, "loss/jsd": 0.0, "loss/logits": 0.3798653483390808, "step": 632 }, { "epoch": 0.039625, "grad_norm": 4.75, "grad_norm_var": 0.35611572265625, "learning_rate": 0.0001, "loss": 10.068, "loss/crossentropy": 2.3498564958572388, "loss/hidden": 4.21875, "loss/jsd": 0.0, "loss/logits": 0.3759974241256714, "step": 634 }, { "epoch": 0.03975, "grad_norm": 5.65625, "grad_norm_var": 0.37701822916666666, "learning_rate": 0.0001, "loss": 10.1594, "loss/crossentropy": 2.381397247314453, "loss/hidden": 4.09375, "loss/jsd": 0.0, "loss/logits": 0.33796676993370056, "step": 636 }, { "epoch": 0.039875, "grad_norm": 4.96875, "grad_norm_var": 0.40089518229166665, "learning_rate": 0.0001, "loss": 9.9976, "loss/crossentropy": 2.106156349182129, "loss/hidden": 4.0390625, "loss/jsd": 0.0, "loss/logits": 0.3315333127975464, "step": 638 }, { "epoch": 0.04, "grad_norm": 4.5, "grad_norm_var": 0.41092122395833336, "learning_rate": 0.0001, "loss": 10.1764, "loss/crossentropy": 2.0783875584602356, "loss/hidden": 4.140625, "loss/jsd": 0.0, "loss/logits": 0.3635980039834976, "step": 640 }, { "epoch": 0.040125, "grad_norm": 5.0, "grad_norm_var": 1.6003743489583333, "learning_rate": 0.0001, "loss": 11.6475, "loss/crossentropy": 2.2979753017425537, "loss/hidden": 4.09375, "loss/jsd": 0.0, "loss/logits": 0.3724391460418701, "step": 642 }, { "epoch": 0.04025, "grad_norm": 4.46875, "grad_norm_var": 1.6145792643229167, "learning_rate": 0.0001, "loss": 9.6654, "loss/crossentropy": 2.376120924949646, "loss/hidden": 3.96875, "loss/jsd": 0.0, "loss/logits": 0.34977029263973236, "step": 644 }, { "epoch": 0.040375, "grad_norm": 4.875, "grad_norm_var": 1.522900390625, "learning_rate": 0.0001, "loss": 10.4999, "loss/crossentropy": 2.439871311187744, "loss/hidden": 4.1875, "loss/jsd": 0.0, "loss/logits": 0.4230840504169464, "step": 646 }, { "epoch": 0.0405, "grad_norm": 4.34375, "grad_norm_var": 1.6555948893229167, "learning_rate": 0.0001, "loss": 10.0302, "loss/crossentropy": 2.1236563324928284, "loss/hidden": 4.203125, "loss/jsd": 0.0, "loss/logits": 0.31898312270641327, "step": 648 }, { "epoch": 0.040625, "grad_norm": 12.0, "grad_norm_var": 4.60338134765625, "learning_rate": 0.0001, "loss": 10.3265, "loss/crossentropy": 2.3450835943222046, "loss/hidden": 4.0078125, "loss/jsd": 0.0, "loss/logits": 0.39976833760738373, "step": 650 }, { "epoch": 0.04075, "grad_norm": 4.21875, "grad_norm_var": 4.690755208333333, "learning_rate": 0.0001, "loss": 10.1511, "loss/crossentropy": 2.255510449409485, "loss/hidden": 4.078125, "loss/jsd": 0.0, "loss/logits": 0.37268275022506714, "step": 652 }, { "epoch": 0.040875, "grad_norm": 13.0625, "grad_norm_var": 8.293290201822916, "learning_rate": 0.0001, "loss": 10.1556, "loss/crossentropy": 2.468876004219055, "loss/hidden": 4.09375, "loss/jsd": 0.0, "loss/logits": 0.36706840991973877, "step": 654 }, { "epoch": 0.041, "grad_norm": 4.4375, "grad_norm_var": 8.233968098958334, "learning_rate": 0.0001, "loss": 10.1624, "loss/crossentropy": 2.4718987941741943, "loss/hidden": 4.109375, "loss/jsd": 0.0, "loss/logits": 0.36339423060417175, "step": 656 }, { "epoch": 0.041125, "grad_norm": 4.65625, "grad_norm_var": 7.510026041666666, "learning_rate": 0.0001, "loss": 10.0571, "loss/crossentropy": 2.3693588972091675, "loss/hidden": 4.09375, "loss/jsd": 0.0, "loss/logits": 0.34288500249385834, "step": 658 }, { "epoch": 0.04125, "grad_norm": 4.8125, "grad_norm_var": 7.539253743489583, "learning_rate": 0.0001, "loss": 9.9376, "loss/crossentropy": 2.22864305973053, "loss/hidden": 4.015625, "loss/jsd": 0.0, "loss/logits": 0.3700525462627411, "step": 660 }, { "epoch": 0.041375, "grad_norm": 5.34375, "grad_norm_var": 8.079423014322916, "learning_rate": 0.0001, "loss": 10.27, "loss/crossentropy": 2.3558337688446045, "loss/hidden": 4.15625, "loss/jsd": 0.0, "loss/logits": 0.39019152522087097, "step": 662 }, { "epoch": 0.0415, "grad_norm": 4.9375, "grad_norm_var": 7.692708333333333, "learning_rate": 0.0001, "loss": 10.43, "loss/crossentropy": 2.326426863670349, "loss/hidden": 4.09375, "loss/jsd": 0.0, "loss/logits": 0.4063562750816345, "step": 664 }, { "epoch": 0.041625, "grad_norm": 6.3125, "grad_norm_var": 21.371089680989584, "learning_rate": 0.0001, "loss": 10.7808, "loss/crossentropy": 2.1193381547927856, "loss/hidden": 4.515625, "loss/jsd": 0.0, "loss/logits": 0.4385177791118622, "step": 666 }, { "epoch": 0.04175, "grad_norm": 4.59375, "grad_norm_var": 21.2443359375, "learning_rate": 0.0001, "loss": 10.0136, "loss/crossentropy": 2.233449101448059, "loss/hidden": 4.0859375, "loss/jsd": 0.0, "loss/logits": 0.3605906367301941, "step": 668 }, { "epoch": 0.041875, "grad_norm": 4.65625, "grad_norm_var": 18.495829264322918, "learning_rate": 0.0001, "loss": 10.0565, "loss/crossentropy": 2.168951630592346, "loss/hidden": 4.15625, "loss/jsd": 0.0, "loss/logits": 0.3409145474433899, "step": 670 }, { "epoch": 0.042, "grad_norm": 5.03125, "grad_norm_var": 21.037760416666668, "learning_rate": 0.0001, "loss": 10.6652, "loss/crossentropy": 2.589960813522339, "loss/hidden": 4.234375, "loss/jsd": 0.0, "loss/logits": 0.43253809213638306, "step": 672 }, { "epoch": 0.042125, "grad_norm": 5.25, "grad_norm_var": 20.730562337239583, "learning_rate": 0.0001, "loss": 10.057, "loss/crossentropy": 2.16236674785614, "loss/hidden": 4.078125, "loss/jsd": 0.0, "loss/logits": 0.3319668024778366, "step": 674 }, { "epoch": 0.04225, "grad_norm": 5.46875, "grad_norm_var": 20.581624348958332, "learning_rate": 0.0001, "loss": 9.9368, "loss/crossentropy": 2.5094408988952637, "loss/hidden": 4.078125, "loss/jsd": 0.0, "loss/logits": 0.363776296377182, "step": 676 }, { "epoch": 0.042375, "grad_norm": 5.03125, "grad_norm_var": 20.604622395833335, "learning_rate": 0.0001, "loss": 10.1039, "loss/crossentropy": 2.2195874452590942, "loss/hidden": 4.015625, "loss/jsd": 0.0, "loss/logits": 0.3527718633413315, "step": 678 }, { "epoch": 0.0425, "grad_norm": 4.8125, "grad_norm_var": 20.883186848958335, "learning_rate": 0.0001, "loss": 9.9975, "loss/crossentropy": 2.1230783462524414, "loss/hidden": 3.9375, "loss/jsd": 0.0, "loss/logits": 0.3537606745958328, "step": 680 }, { "epoch": 0.042625, "grad_norm": 4.71875, "grad_norm_var": 4.101753743489583, "learning_rate": 0.0001, "loss": 9.7443, "loss/crossentropy": 2.0430566668510437, "loss/hidden": 4.109375, "loss/jsd": 0.0, "loss/logits": 0.35527075827121735, "step": 682 }, { "epoch": 0.04275, "grad_norm": 4.15625, "grad_norm_var": 4.253641764322917, "learning_rate": 0.0001, "loss": 9.7582, "loss/crossentropy": 2.294941544532776, "loss/hidden": 4.046875, "loss/jsd": 0.0, "loss/logits": 0.36452287435531616, "step": 684 }, { "epoch": 0.042875, "grad_norm": 4.96875, "grad_norm_var": 4.21929931640625, "learning_rate": 0.0001, "loss": 10.0321, "loss/crossentropy": 2.300116777420044, "loss/hidden": 4.015625, "loss/jsd": 0.0, "loss/logits": 0.3483743965625763, "step": 686 }, { "epoch": 0.043, "grad_norm": 5.03125, "grad_norm_var": 0.14621988932291666, "learning_rate": 0.0001, "loss": 10.0473, "loss/crossentropy": 2.39257276058197, "loss/hidden": 4.078125, "loss/jsd": 0.0, "loss/logits": 0.38009554147720337, "step": 688 }, { "epoch": 0.043125, "grad_norm": 4.15625, "grad_norm_var": 0.15354410807291666, "learning_rate": 0.0001, "loss": 10.0047, "loss/crossentropy": 2.7224282026290894, "loss/hidden": 3.953125, "loss/jsd": 0.0, "loss/logits": 0.3659520596265793, "step": 690 }, { "epoch": 0.04325, "grad_norm": 4.1875, "grad_norm_var": 0.16144205729166666, "learning_rate": 0.0001, "loss": 9.8552, "loss/crossentropy": 2.3452165126800537, "loss/hidden": 4.015625, "loss/jsd": 0.0, "loss/logits": 0.341473788022995, "step": 692 }, { "epoch": 0.043375, "grad_norm": 3.984375, "grad_norm_var": 0.16150614420572917, "learning_rate": 0.0001, "loss": 9.9273, "loss/crossentropy": 2.3679224252700806, "loss/hidden": 4.078125, "loss/jsd": 0.0, "loss/logits": 0.36363688111305237, "step": 694 }, { "epoch": 0.0435, "grad_norm": 4.25, "grad_norm_var": 0.1602447509765625, "learning_rate": 0.0001, "loss": 9.8577, "loss/crossentropy": 2.3564072847366333, "loss/hidden": 4.0234375, "loss/jsd": 0.0, "loss/logits": 0.36827197670936584, "step": 696 }, { "epoch": 0.043625, "grad_norm": 6.84375, "grad_norm_var": 0.5215321858723958, "learning_rate": 0.0001, "loss": 9.9648, "loss/crossentropy": 2.2375397086143494, "loss/hidden": 4.046875, "loss/jsd": 0.0, "loss/logits": 0.3416619747877121, "step": 698 }, { "epoch": 0.04375, "grad_norm": 5.65625, "grad_norm_var": 1.5632314046223958, "learning_rate": 0.0001, "loss": 10.3779, "loss/crossentropy": 2.6283761262893677, "loss/hidden": 4.140625, "loss/jsd": 0.0, "loss/logits": 0.3753877878189087, "step": 700 }, { "epoch": 0.043875, "grad_norm": 6.0, "grad_norm_var": 1.7005360921223958, "learning_rate": 0.0001, "loss": 10.2408, "loss/crossentropy": 2.450525403022766, "loss/hidden": 4.0859375, "loss/jsd": 0.0, "loss/logits": 0.3759635388851166, "step": 702 }, { "epoch": 0.044, "grad_norm": 4.15625, "grad_norm_var": 1.7672434488932292, "learning_rate": 0.0001, "loss": 9.6788, "loss/crossentropy": 2.2674474716186523, "loss/hidden": 3.9375, "loss/jsd": 0.0, "loss/logits": 0.3708268404006958, "step": 704 }, { "epoch": 0.044125, "grad_norm": 4.78125, "grad_norm_var": 1.7210439046223958, "learning_rate": 0.0001, "loss": 10.2047, "loss/crossentropy": 2.3761132955551147, "loss/hidden": 3.9296875, "loss/jsd": 0.0, "loss/logits": 0.33222293853759766, "step": 706 }, { "epoch": 0.04425, "grad_norm": 4.5, "grad_norm_var": 1.619823201497396, "learning_rate": 0.0001, "loss": 9.9987, "loss/crossentropy": 2.403549075126648, "loss/hidden": 4.03125, "loss/jsd": 0.0, "loss/logits": 0.3612934798002243, "step": 708 }, { "epoch": 0.044375, "grad_norm": 4.46875, "grad_norm_var": 1.5661295572916667, "learning_rate": 0.0001, "loss": 10.1588, "loss/crossentropy": 2.25975239276886, "loss/hidden": 4.328125, "loss/jsd": 0.0, "loss/logits": 0.34882494807243347, "step": 710 }, { "epoch": 0.0445, "grad_norm": 5.5, "grad_norm_var": 1.514697265625, "learning_rate": 0.0001, "loss": 10.3655, "loss/crossentropy": 2.5604530572891235, "loss/hidden": 3.9765625, "loss/jsd": 0.0, "loss/logits": 0.37771423161029816, "step": 712 }, { "epoch": 0.044625, "grad_norm": 4.78125, "grad_norm_var": 1.28101806640625, "learning_rate": 0.0001, "loss": 10.5518, "loss/crossentropy": 2.1526423692703247, "loss/hidden": 4.171875, "loss/jsd": 0.0, "loss/logits": 0.3977412283420563, "step": 714 }, { "epoch": 0.04475, "grad_norm": 4.5625, "grad_norm_var": 0.3580078125, "learning_rate": 0.0001, "loss": 9.7804, "loss/crossentropy": 2.431856870651245, "loss/hidden": 4.015625, "loss/jsd": 0.0, "loss/logits": 0.3631362318992615, "step": 716 }, { "epoch": 0.044875, "grad_norm": 5.0625, "grad_norm_var": 0.15533447265625, "learning_rate": 0.0001, "loss": 9.9266, "loss/crossentropy": 2.418755054473877, "loss/hidden": 3.90625, "loss/jsd": 0.0, "loss/logits": 0.3390650153160095, "step": 718 }, { "epoch": 0.045, "grad_norm": 4.46875, "grad_norm_var": 0.15953369140625, "learning_rate": 0.0001, "loss": 10.1235, "loss/crossentropy": 2.6371216773986816, "loss/hidden": 3.984375, "loss/jsd": 0.0, "loss/logits": 0.3484189212322235, "step": 720 }, { "epoch": 0.045125, "grad_norm": 4.21875, "grad_norm_var": 0.22204488118489582, "learning_rate": 0.0001, "loss": 9.9546, "loss/crossentropy": 2.2124441862106323, "loss/hidden": 4.40625, "loss/jsd": 0.0, "loss/logits": 0.34575599431991577, "step": 722 }, { "epoch": 0.04525, "grad_norm": 4.34375, "grad_norm_var": 0.23769429524739583, "learning_rate": 0.0001, "loss": 10.1869, "loss/crossentropy": 2.338310956954956, "loss/hidden": 4.25, "loss/jsd": 0.0, "loss/logits": 0.3609844744205475, "step": 724 }, { "epoch": 0.045375, "grad_norm": 4.5, "grad_norm_var": 0.2568511962890625, "learning_rate": 0.0001, "loss": 9.9099, "loss/crossentropy": 2.423145294189453, "loss/hidden": 3.890625, "loss/jsd": 0.0, "loss/logits": 0.3508508801460266, "step": 726 }, { "epoch": 0.0455, "grad_norm": 4.1875, "grad_norm_var": 0.2234771728515625, "learning_rate": 0.0001, "loss": 9.8168, "loss/crossentropy": 2.485997796058655, "loss/hidden": 3.96875, "loss/jsd": 0.0, "loss/logits": 0.3413703292608261, "step": 728 }, { "epoch": 0.045625, "grad_norm": 4.65625, "grad_norm_var": 0.1412506103515625, "learning_rate": 0.0001, "loss": 10.0025, "loss/crossentropy": 2.5259501934051514, "loss/hidden": 4.0234375, "loss/jsd": 0.0, "loss/logits": 0.3655065894126892, "step": 730 }, { "epoch": 0.04575, "grad_norm": 4.3125, "grad_norm_var": 0.14853413899739584, "learning_rate": 0.0001, "loss": 9.7069, "loss/crossentropy": 2.2198448181152344, "loss/hidden": 4.046875, "loss/jsd": 0.0, "loss/logits": 0.3497984856367111, "step": 732 }, { "epoch": 0.045875, "grad_norm": 4.40625, "grad_norm_var": 0.2511220296223958, "learning_rate": 0.0001, "loss": 10.1372, "loss/crossentropy": 2.53315532207489, "loss/hidden": 4.0703125, "loss/jsd": 0.0, "loss/logits": 0.37490157783031464, "step": 734 }, { "epoch": 0.046, "grad_norm": 4.5, "grad_norm_var": 0.19761454264322917, "learning_rate": 0.0001, "loss": 10.1477, "loss/crossentropy": 2.4126853942871094, "loss/hidden": 3.984375, "loss/jsd": 0.0, "loss/logits": 0.3843526244163513, "step": 736 }, { "epoch": 0.046125, "grad_norm": 4.46875, "grad_norm_var": 0.18131103515625, "learning_rate": 0.0001, "loss": 9.7948, "loss/crossentropy": 2.2977144718170166, "loss/hidden": 3.9140625, "loss/jsd": 0.0, "loss/logits": 0.3733298182487488, "step": 738 }, { "epoch": 0.04625, "grad_norm": 4.65625, "grad_norm_var": 0.18857014973958333, "learning_rate": 0.0001, "loss": 9.8328, "loss/crossentropy": 2.246976613998413, "loss/hidden": 4.046875, "loss/jsd": 0.0, "loss/logits": 0.3634299635887146, "step": 740 }, { "epoch": 0.046375, "grad_norm": 4.53125, "grad_norm_var": 0.17237955729166668, "learning_rate": 0.0001, "loss": 10.3349, "loss/crossentropy": 2.6265417337417603, "loss/hidden": 3.96875, "loss/jsd": 0.0, "loss/logits": 0.3826737552881241, "step": 742 }, { "epoch": 0.0465, "grad_norm": 4.25, "grad_norm_var": 0.174072265625, "learning_rate": 0.0001, "loss": 10.1582, "loss/crossentropy": 2.2192777395248413, "loss/hidden": 3.921875, "loss/jsd": 0.0, "loss/logits": 0.3405959904193878, "step": 744 }, { "epoch": 0.046625, "grad_norm": 4.3125, "grad_norm_var": 0.17711181640625, "learning_rate": 0.0001, "loss": 9.8623, "loss/crossentropy": 2.3501724004745483, "loss/hidden": 3.875, "loss/jsd": 0.0, "loss/logits": 0.3643736243247986, "step": 746 }, { "epoch": 0.04675, "grad_norm": 4.1875, "grad_norm_var": 0.17711181640625, "learning_rate": 0.0001, "loss": 9.9924, "loss/crossentropy": 2.3081964254379272, "loss/hidden": 4.078125, "loss/jsd": 0.0, "loss/logits": 0.3664351552724838, "step": 748 }, { "epoch": 0.046875, "grad_norm": 4.53125, "grad_norm_var": 0.0873046875, "learning_rate": 0.0001, "loss": 10.2221, "loss/crossentropy": 2.603135824203491, "loss/hidden": 3.953125, "loss/jsd": 0.0, "loss/logits": 0.38127802312374115, "step": 750 }, { "epoch": 0.047, "grad_norm": 3.953125, "grad_norm_var": 0.1204986572265625, "learning_rate": 0.0001, "loss": 9.9005, "loss/crossentropy": 2.2832648754119873, "loss/hidden": 3.9921875, "loss/jsd": 0.0, "loss/logits": 0.34063348174095154, "step": 752 }, { "epoch": 0.047125, "grad_norm": 5.71875, "grad_norm_var": 0.2331695556640625, "learning_rate": 0.0001, "loss": 9.785, "loss/crossentropy": 2.2105308771133423, "loss/hidden": 3.7734375, "loss/jsd": 0.0, "loss/logits": 0.34888406097888947, "step": 754 }, { "epoch": 0.04725, "grad_norm": 4.90625, "grad_norm_var": 0.2447662353515625, "learning_rate": 0.0001, "loss": 10.1132, "loss/crossentropy": 2.325472354888916, "loss/hidden": 4.140625, "loss/jsd": 0.0, "loss/logits": 0.36066247522830963, "step": 756 }, { "epoch": 0.047375, "grad_norm": 4.25, "grad_norm_var": 0.2519683837890625, "learning_rate": 0.0001, "loss": 10.2541, "loss/crossentropy": 2.450170159339905, "loss/hidden": 3.8984375, "loss/jsd": 0.0, "loss/logits": 0.35374966263771057, "step": 758 }, { "epoch": 0.0475, "grad_norm": 4.125, "grad_norm_var": 0.2753082275390625, "learning_rate": 0.0001, "loss": 10.182, "loss/crossentropy": 2.3322771787643433, "loss/hidden": 4.140625, "loss/jsd": 0.0, "loss/logits": 0.3891519159078598, "step": 760 }, { "epoch": 0.047625, "grad_norm": 4.28125, "grad_norm_var": 0.2733062744140625, "learning_rate": 0.0001, "loss": 9.6924, "loss/crossentropy": 2.3392467498779297, "loss/hidden": 3.9296875, "loss/jsd": 0.0, "loss/logits": 0.3408708870410919, "step": 762 }, { "epoch": 0.04775, "grad_norm": 4.875, "grad_norm_var": 0.30399983723958335, "learning_rate": 0.0001, "loss": 9.7753, "loss/crossentropy": 2.443954825401306, "loss/hidden": 3.890625, "loss/jsd": 0.0, "loss/logits": 0.3485272228717804, "step": 764 }, { "epoch": 0.047875, "grad_norm": 4.25, "grad_norm_var": 0.28814697265625, "learning_rate": 0.0001, "loss": 9.8546, "loss/crossentropy": 2.6792436838150024, "loss/hidden": 4.0, "loss/jsd": 0.0, "loss/logits": 0.38479599356651306, "step": 766 }, { "epoch": 0.048, "grad_norm": 4.5, "grad_norm_var": 0.2582427978515625, "learning_rate": 0.0001, "loss": 10.0567, "loss/crossentropy": 2.077214241027832, "loss/hidden": 4.328125, "loss/jsd": 0.0, "loss/logits": 0.4079577922821045, "step": 768 }, { "epoch": 0.048125, "grad_norm": 4.53125, "grad_norm_var": 0.14048563639322917, "learning_rate": 0.0001, "loss": 10.0187, "loss/crossentropy": 2.407822012901306, "loss/hidden": 3.8671875, "loss/jsd": 0.0, "loss/logits": 0.34482939541339874, "step": 770 }, { "epoch": 0.04825, "grad_norm": 4.21875, "grad_norm_var": 0.12446187337239584, "learning_rate": 0.0001, "loss": 9.7492, "loss/crossentropy": 2.391669511795044, "loss/hidden": 3.953125, "loss/jsd": 0.0, "loss/logits": 0.38050293922424316, "step": 772 }, { "epoch": 0.048375, "grad_norm": 4.46875, "grad_norm_var": 0.1244293212890625, "learning_rate": 0.0001, "loss": 9.9462, "loss/crossentropy": 2.420830249786377, "loss/hidden": 3.8359375, "loss/jsd": 0.0, "loss/logits": 0.37534038722515106, "step": 774 }, { "epoch": 0.0485, "grad_norm": 4.28125, "grad_norm_var": 0.09460347493489583, "learning_rate": 0.0001, "loss": 9.4007, "loss/crossentropy": 2.417847156524658, "loss/hidden": 4.0, "loss/jsd": 0.0, "loss/logits": 0.3467719852924347, "step": 776 }, { "epoch": 0.048625, "grad_norm": 4.375, "grad_norm_var": 0.0925933837890625, "learning_rate": 0.0001, "loss": 9.5994, "loss/crossentropy": 2.120936870574951, "loss/hidden": 3.8671875, "loss/jsd": 0.0, "loss/logits": 0.3367535322904587, "step": 778 }, { "epoch": 0.04875, "grad_norm": 3.84375, "grad_norm_var": 0.07499593098958333, "learning_rate": 0.0001, "loss": 9.3811, "loss/crossentropy": 2.1081444025039673, "loss/hidden": 3.828125, "loss/jsd": 0.0, "loss/logits": 0.3248763531446457, "step": 780 }, { "epoch": 0.048875, "grad_norm": 3.859375, "grad_norm_var": 0.08439127604166667, "learning_rate": 0.0001, "loss": 9.4596, "loss/crossentropy": 2.300752282142639, "loss/hidden": 3.8515625, "loss/jsd": 0.0, "loss/logits": 0.3305806368589401, "step": 782 }, { "epoch": 0.049, "grad_norm": 5.46875, "grad_norm_var": 51.37913411458333, "learning_rate": 0.0001, "loss": 10.7131, "loss/crossentropy": 2.282582998275757, "loss/hidden": 4.078125, "loss/jsd": 0.0, "loss/logits": 0.4002307057380676, "step": 784 }, { "epoch": 0.049125, "grad_norm": 4.25, "grad_norm_var": 51.563899739583334, "learning_rate": 0.0001, "loss": 9.8669, "loss/crossentropy": 2.292482376098633, "loss/hidden": 4.125, "loss/jsd": 0.0, "loss/logits": 0.3897457867860794, "step": 786 }, { "epoch": 0.04925, "grad_norm": 4.375, "grad_norm_var": 51.68466796875, "learning_rate": 0.0001, "loss": 9.5633, "loss/crossentropy": 2.191980719566345, "loss/hidden": 3.8828125, "loss/jsd": 0.0, "loss/logits": 0.33880043029785156, "step": 788 }, { "epoch": 0.049375, "grad_norm": 4.84375, "grad_norm_var": 51.69274088541667, "learning_rate": 0.0001, "loss": 9.8919, "loss/crossentropy": 2.1574501991271973, "loss/hidden": 3.828125, "loss/jsd": 0.0, "loss/logits": 0.32353881001472473, "step": 790 }, { "epoch": 0.0495, "grad_norm": 6.5625, "grad_norm_var": 51.19324544270833, "learning_rate": 0.0001, "loss": 10.3413, "loss/crossentropy": 2.2851617336273193, "loss/hidden": 4.0, "loss/jsd": 0.0, "loss/logits": 0.3659636974334717, "step": 792 }, { "epoch": 0.049625, "grad_norm": 5.78125, "grad_norm_var": 56.89988606770833, "learning_rate": 0.0001, "loss": 10.4382, "loss/crossentropy": 2.4926841259002686, "loss/hidden": 4.109375, "loss/jsd": 0.0, "loss/logits": 0.42691296339035034, "step": 794 }, { "epoch": 0.04975, "grad_norm": 4.4375, "grad_norm_var": 56.84940999348958, "learning_rate": 0.0001, "loss": 9.4994, "loss/crossentropy": 2.1696799993515015, "loss/hidden": 3.8203125, "loss/jsd": 0.0, "loss/logits": 0.34047268331050873, "step": 796 }, { "epoch": 0.049875, "grad_norm": 5.125, "grad_norm_var": 56.05621337890625, "learning_rate": 0.0001, "loss": 9.9164, "loss/crossentropy": 2.1714417338371277, "loss/hidden": 4.0625, "loss/jsd": 0.0, "loss/logits": 0.30278290808200836, "step": 798 }, { "epoch": 0.05, "grad_norm": 4.375, "grad_norm_var": 10.297379557291666, "learning_rate": 0.0001, "loss": 9.4822, "loss/crossentropy": 2.265379786491394, "loss/hidden": 3.9375, "loss/jsd": 0.0, "loss/logits": 0.34953707456588745, "step": 800 }, { "epoch": 0.050125, "grad_norm": 4.34375, "grad_norm_var": 10.19342041015625, "learning_rate": 0.0001, "loss": 9.6612, "loss/crossentropy": 2.137158155441284, "loss/hidden": 4.0, "loss/jsd": 0.0, "loss/logits": 0.31570929288864136, "step": 802 }, { "epoch": 0.05025, "grad_norm": 4.09375, "grad_norm_var": 10.28170166015625, "learning_rate": 0.0001, "loss": 9.9448, "loss/crossentropy": 2.39846932888031, "loss/hidden": 3.8515625, "loss/jsd": 0.0, "loss/logits": 0.3465288430452347, "step": 804 }, { "epoch": 0.050375, "grad_norm": 3.984375, "grad_norm_var": 9.948173014322917, "learning_rate": 0.0001, "loss": 9.7494, "loss/crossentropy": 2.5398186445236206, "loss/hidden": 3.9375, "loss/jsd": 0.0, "loss/logits": 0.3655957281589508, "step": 806 }, { "epoch": 0.0505, "grad_norm": 4.28125, "grad_norm_var": 9.891792805989583, "learning_rate": 0.0001, "loss": 9.8606, "loss/crossentropy": 2.3692381381988525, "loss/hidden": 4.03125, "loss/jsd": 0.0, "loss/logits": 0.36725762486457825, "step": 808 }, { "epoch": 0.050625, "grad_norm": 4.25, "grad_norm_var": 0.18055013020833333, "learning_rate": 0.0001, "loss": 9.8036, "loss/crossentropy": 2.327956438064575, "loss/hidden": 3.953125, "loss/jsd": 0.0, "loss/logits": 0.34801676869392395, "step": 810 }, { "epoch": 0.05075, "grad_norm": 4.65625, "grad_norm_var": 0.1889556884765625, "learning_rate": 0.0001, "loss": 9.7749, "loss/crossentropy": 2.406418204307556, "loss/hidden": 3.9296875, "loss/jsd": 0.0, "loss/logits": 0.33478011190891266, "step": 812 }, { "epoch": 0.050875, "grad_norm": 4.25, "grad_norm_var": 0.1968170166015625, "learning_rate": 0.0001, "loss": 9.7763, "loss/crossentropy": 2.111366391181946, "loss/hidden": 3.8828125, "loss/jsd": 0.0, "loss/logits": 0.3220367878675461, "step": 814 }, { "epoch": 0.051, "grad_norm": 4.0625, "grad_norm_var": 0.21132405598958334, "learning_rate": 0.0001, "loss": 9.8965, "loss/crossentropy": 2.525599956512451, "loss/hidden": 4.015625, "loss/jsd": 0.0, "loss/logits": 0.3607639819383621, "step": 816 }, { "epoch": 0.051125, "grad_norm": 4.78125, "grad_norm_var": 0.23552144368489583, "learning_rate": 0.0001, "loss": 9.4786, "loss/crossentropy": 2.189106583595276, "loss/hidden": 3.859375, "loss/jsd": 0.0, "loss/logits": 0.3183937668800354, "step": 818 }, { "epoch": 0.05125, "grad_norm": 4.71875, "grad_norm_var": 0.22988179524739583, "learning_rate": 0.0001, "loss": 9.8415, "loss/crossentropy": 2.520777702331543, "loss/hidden": 3.96875, "loss/jsd": 0.0, "loss/logits": 0.3600146919488907, "step": 820 }, { "epoch": 0.051375, "grad_norm": 4.25, "grad_norm_var": 0.21760965983072916, "learning_rate": 0.0001, "loss": 9.8277, "loss/crossentropy": 2.4016454219818115, "loss/hidden": 3.828125, "loss/jsd": 0.0, "loss/logits": 0.3279205560684204, "step": 822 }, { "epoch": 0.0515, "grad_norm": 4.21875, "grad_norm_var": 0.1434967041015625, "learning_rate": 0.0001, "loss": 9.7368, "loss/crossentropy": 2.133383274078369, "loss/hidden": 3.953125, "loss/jsd": 0.0, "loss/logits": 0.34343835711479187, "step": 824 }, { "epoch": 0.051625, "grad_norm": 4.625, "grad_norm_var": 0.14257710774739582, "learning_rate": 0.0001, "loss": 9.5228, "loss/crossentropy": 2.483129382133484, "loss/hidden": 4.0625, "loss/jsd": 0.0, "loss/logits": 0.36225035786628723, "step": 826 }, { "epoch": 0.05175, "grad_norm": 4.84375, "grad_norm_var": 0.14383138020833333, "learning_rate": 0.0001, "loss": 10.3292, "loss/crossentropy": 2.357838988304138, "loss/hidden": 3.796875, "loss/jsd": 0.0, "loss/logits": 0.3369516283273697, "step": 828 }, { "epoch": 0.051875, "grad_norm": 5.15625, "grad_norm_var": 0.13625895182291667, "learning_rate": 0.0001, "loss": 10.2483, "loss/crossentropy": 2.266453266143799, "loss/hidden": 4.078125, "loss/jsd": 0.0, "loss/logits": 0.3067970871925354, "step": 830 }, { "epoch": 0.052, "grad_norm": 4.375, "grad_norm_var": 0.1204010009765625, "learning_rate": 0.0001, "loss": 9.9158, "loss/crossentropy": 2.6512131690979004, "loss/hidden": 3.890625, "loss/jsd": 0.0, "loss/logits": 0.36491644382476807, "step": 832 }, { "epoch": 0.052125, "grad_norm": 4.3125, "grad_norm_var": 0.11005757649739584, "learning_rate": 0.0001, "loss": 9.9149, "loss/crossentropy": 2.2720338106155396, "loss/hidden": 3.921875, "loss/jsd": 0.0, "loss/logits": 0.34162379801273346, "step": 834 }, { "epoch": 0.05225, "grad_norm": 4.46875, "grad_norm_var": 0.10251363118489583, "learning_rate": 0.0001, "loss": 9.4734, "loss/crossentropy": 2.486873984336853, "loss/hidden": 3.9296875, "loss/jsd": 0.0, "loss/logits": 0.39508073031902313, "step": 836 }, { "epoch": 0.052375, "grad_norm": 4.875, "grad_norm_var": 0.1392486572265625, "learning_rate": 0.0001, "loss": 9.9704, "loss/crossentropy": 2.5282520055770874, "loss/hidden": 3.875, "loss/jsd": 0.0, "loss/logits": 0.37042422592639923, "step": 838 }, { "epoch": 0.0525, "grad_norm": 4.71875, "grad_norm_var": 0.14732157389322917, "learning_rate": 0.0001, "loss": 9.6631, "loss/crossentropy": 2.2389113903045654, "loss/hidden": 3.984375, "loss/jsd": 0.0, "loss/logits": 0.3424752354621887, "step": 840 }, { "epoch": 0.052625, "grad_norm": 7.5625, "grad_norm_var": 0.7134592692057292, "learning_rate": 0.0001, "loss": 10.3695, "loss/crossentropy": 2.649301052093506, "loss/hidden": 3.9765625, "loss/jsd": 0.0, "loss/logits": 0.4379357695579529, "step": 842 }, { "epoch": 0.05275, "grad_norm": 5.28125, "grad_norm_var": 0.7451487223307292, "learning_rate": 0.0001, "loss": 9.7948, "loss/crossentropy": 2.268893837928772, "loss/hidden": 3.953125, "loss/jsd": 0.0, "loss/logits": 0.3524494171142578, "step": 844 }, { "epoch": 0.052875, "grad_norm": 4.53125, "grad_norm_var": 0.7412017822265625, "learning_rate": 0.0001, "loss": 9.8358, "loss/crossentropy": 2.404254674911499, "loss/hidden": 3.8046875, "loss/jsd": 0.0, "loss/logits": 0.34458406269550323, "step": 846 }, { "epoch": 0.053, "grad_norm": 4.65625, "grad_norm_var": 0.7700480143229167, "learning_rate": 0.0001, "loss": 9.7406, "loss/crossentropy": 2.3409796953201294, "loss/hidden": 3.8828125, "loss/jsd": 0.0, "loss/logits": 0.3395507335662842, "step": 848 }, { "epoch": 0.053125, "grad_norm": 4.5625, "grad_norm_var": 0.9541168212890625, "learning_rate": 0.0001, "loss": 10.0335, "loss/crossentropy": 2.196879267692566, "loss/hidden": 3.8515625, "loss/jsd": 0.0, "loss/logits": 0.32191002368927, "step": 850 }, { "epoch": 0.05325, "grad_norm": 5.0, "grad_norm_var": 0.8870595296223959, "learning_rate": 0.0001, "loss": 10.0438, "loss/crossentropy": 2.388745427131653, "loss/hidden": 3.953125, "loss/jsd": 0.0, "loss/logits": 0.3636998236179352, "step": 852 }, { "epoch": 0.053375, "grad_norm": 4.28125, "grad_norm_var": 0.9299550374348958, "learning_rate": 0.0001, "loss": 9.867, "loss/crossentropy": 2.1942824721336365, "loss/hidden": 3.953125, "loss/jsd": 0.0, "loss/logits": 0.34746792912483215, "step": 854 }, { "epoch": 0.0535, "grad_norm": 4.71875, "grad_norm_var": 1.1148834228515625, "learning_rate": 0.0001, "loss": 10.0672, "loss/crossentropy": 2.205955982208252, "loss/hidden": 3.8984375, "loss/jsd": 0.0, "loss/logits": 0.3741550147533417, "step": 856 }, { "epoch": 0.053625, "grad_norm": 4.53125, "grad_norm_var": 0.7759023030598958, "learning_rate": 0.0001, "loss": 10.099, "loss/crossentropy": 2.351234793663025, "loss/hidden": 3.8203125, "loss/jsd": 0.0, "loss/logits": 0.34030967950820923, "step": 858 }, { "epoch": 0.05375, "grad_norm": 4.8125, "grad_norm_var": 0.8518300374348958, "learning_rate": 0.0001, "loss": 9.9281, "loss/crossentropy": 2.1350356340408325, "loss/hidden": 3.8671875, "loss/jsd": 0.0, "loss/logits": 0.3277411162853241, "step": 860 }, { "epoch": 0.053875, "grad_norm": 4.09375, "grad_norm_var": 0.9161041259765625, "learning_rate": 0.0001, "loss": 9.4089, "loss/crossentropy": 2.2630919218063354, "loss/hidden": 3.9765625, "loss/jsd": 0.0, "loss/logits": 0.3764640539884567, "step": 862 }, { "epoch": 0.054, "grad_norm": 4.09375, "grad_norm_var": 0.87818603515625, "learning_rate": 0.0001, "loss": 9.584, "loss/crossentropy": 2.2499537467956543, "loss/hidden": 3.765625, "loss/jsd": 0.0, "loss/logits": 0.3396669030189514, "step": 864 }, { "epoch": 0.054125, "grad_norm": 9.125, "grad_norm_var": 1.8325154622395834, "learning_rate": 0.0001, "loss": 10.1075, "loss/crossentropy": 2.291894793510437, "loss/hidden": 3.984375, "loss/jsd": 0.0, "loss/logits": 0.5045499503612518, "step": 866 }, { "epoch": 0.05425, "grad_norm": 4.28125, "grad_norm_var": 1.888134765625, "learning_rate": 0.0001, "loss": 9.4826, "loss/crossentropy": 2.1067745685577393, "loss/hidden": 3.8359375, "loss/jsd": 0.0, "loss/logits": 0.3029894679784775, "step": 868 }, { "epoch": 0.054375, "grad_norm": 4.40625, "grad_norm_var": 1.9430989583333333, "learning_rate": 0.0001, "loss": 9.8674, "loss/crossentropy": 2.269438624382019, "loss/hidden": 3.8671875, "loss/jsd": 0.0, "loss/logits": 0.3538091778755188, "step": 870 }, { "epoch": 0.0545, "grad_norm": 3.71875, "grad_norm_var": 1.67486572265625, "learning_rate": 0.0001, "loss": 9.3252, "loss/crossentropy": 2.486815333366394, "loss/hidden": 3.7578125, "loss/jsd": 0.0, "loss/logits": 0.3138183057308197, "step": 872 }, { "epoch": 0.054625, "grad_norm": 4.21875, "grad_norm_var": 1.6998006184895833, "learning_rate": 0.0001, "loss": 9.3781, "loss/crossentropy": 2.1076109409332275, "loss/hidden": 3.8515625, "loss/jsd": 0.0, "loss/logits": 0.2987312972545624, "step": 874 }, { "epoch": 0.05475, "grad_norm": 4.21875, "grad_norm_var": 1.68990478515625, "learning_rate": 0.0001, "loss": 9.5685, "loss/crossentropy": 2.3705573081970215, "loss/hidden": 3.8046875, "loss/jsd": 0.0, "loss/logits": 0.35792042315006256, "step": 876 }, { "epoch": 0.054875, "grad_norm": 3.75, "grad_norm_var": 1.7203776041666667, "learning_rate": 0.0001, "loss": 9.3276, "loss/crossentropy": 2.441094756126404, "loss/hidden": 3.7578125, "loss/jsd": 0.0, "loss/logits": 0.3004107028245926, "step": 878 }, { "epoch": 0.055, "grad_norm": 4.0, "grad_norm_var": 1.7296183268229166, "learning_rate": 0.0001, "loss": 9.7451, "loss/crossentropy": 2.16308331489563, "loss/hidden": 3.7109375, "loss/jsd": 0.0, "loss/logits": 0.3288375288248062, "step": 880 }, { "epoch": 0.055125, "grad_norm": 4.15625, "grad_norm_var": 0.18866780598958333, "learning_rate": 0.0001, "loss": 9.5159, "loss/crossentropy": 2.05492103099823, "loss/hidden": 3.8359375, "loss/jsd": 0.0, "loss/logits": 0.33372530341148376, "step": 882 }, { "epoch": 0.05525, "grad_norm": 4.375, "grad_norm_var": 0.07303059895833333, "learning_rate": 0.0001, "loss": 10.0029, "loss/crossentropy": 2.4790937900543213, "loss/hidden": 4.03125, "loss/jsd": 0.0, "loss/logits": 0.3800676763057709, "step": 884 }, { "epoch": 0.055375, "grad_norm": 24.625, "grad_norm_var": 26.282535807291666, "learning_rate": 0.0001, "loss": 9.9205, "loss/crossentropy": 2.47172474861145, "loss/hidden": 3.984375, "loss/jsd": 0.0, "loss/logits": 0.3913479149341583, "step": 886 }, { "epoch": 0.0555, "grad_norm": 6.34375, "grad_norm_var": 36.022379557291664, "learning_rate": 0.0001, "loss": 10.1861, "loss/crossentropy": 2.2905625104904175, "loss/hidden": 3.9921875, "loss/jsd": 0.0, "loss/logits": 0.3679163008928299, "step": 888 }, { "epoch": 0.055625, "grad_norm": 4.5, "grad_norm_var": 35.78157145182292, "learning_rate": 0.0001, "loss": 9.7767, "loss/crossentropy": 2.352526307106018, "loss/hidden": 3.9140625, "loss/jsd": 0.0, "loss/logits": 0.3658114969730377, "step": 890 }, { "epoch": 0.05575, "grad_norm": 4.34375, "grad_norm_var": 35.66678059895833, "learning_rate": 0.0001, "loss": 9.5649, "loss/crossentropy": 2.1944379806518555, "loss/hidden": 3.828125, "loss/jsd": 0.0, "loss/logits": 0.3157523572444916, "step": 892 }, { "epoch": 0.055875, "grad_norm": 4.28125, "grad_norm_var": 35.19581705729167, "learning_rate": 0.0001, "loss": 10.1293, "loss/crossentropy": 2.640023946762085, "loss/hidden": 3.9296875, "loss/jsd": 0.0, "loss/logits": 0.40781745314598083, "step": 894 }, { "epoch": 0.056, "grad_norm": 4.09375, "grad_norm_var": 34.96490478515625, "learning_rate": 0.0001, "loss": 9.8729, "loss/crossentropy": 2.2978581190109253, "loss/hidden": 3.9609375, "loss/jsd": 0.0, "loss/logits": 0.34524331986904144, "step": 896 }, { "epoch": 0.056125, "grad_norm": 4.21875, "grad_norm_var": 35.01881510416667, "learning_rate": 0.0001, "loss": 9.4132, "loss/crossentropy": 2.206250786781311, "loss/hidden": 3.890625, "loss/jsd": 0.0, "loss/logits": 0.31589607894420624, "step": 898 }, { "epoch": 0.05625, "grad_norm": 3.78125, "grad_norm_var": 35.51417643229167, "learning_rate": 0.0001, "loss": 9.175, "loss/crossentropy": 2.314005136489868, "loss/hidden": 3.7421875, "loss/jsd": 0.0, "loss/logits": 0.3205699771642685, "step": 900 }, { "epoch": 0.056375, "grad_norm": 3.90625, "grad_norm_var": 13.002958170572917, "learning_rate": 0.0001, "loss": 9.404, "loss/crossentropy": 2.279939651489258, "loss/hidden": 3.71875, "loss/jsd": 0.0, "loss/logits": 0.30738452076911926, "step": 902 }, { "epoch": 0.0565, "grad_norm": 4.125, "grad_norm_var": 0.19872639973958334, "learning_rate": 0.0001, "loss": 9.9184, "loss/crossentropy": 2.1859201192855835, "loss/hidden": 3.890625, "loss/jsd": 0.0, "loss/logits": 0.33272892236709595, "step": 904 }, { "epoch": 0.056625, "grad_norm": 5.03125, "grad_norm_var": 13.173075358072916, "learning_rate": 0.0001, "loss": 10.0596, "loss/crossentropy": 2.372039318084717, "loss/hidden": 3.921875, "loss/jsd": 0.0, "loss/logits": 0.3175077885389328, "step": 906 }, { "epoch": 0.05675, "grad_norm": 4.28125, "grad_norm_var": 13.12115478515625, "learning_rate": 0.0001, "loss": 9.8515, "loss/crossentropy": 2.368989109992981, "loss/hidden": 4.03125, "loss/jsd": 0.0, "loss/logits": 0.38050130009651184, "step": 908 }, { "epoch": 0.056875, "grad_norm": 4.3125, "grad_norm_var": 13.271987915039062, "learning_rate": 0.0001, "loss": 9.4947, "loss/crossentropy": 2.3800413608551025, "loss/hidden": 3.8359375, "loss/jsd": 0.0, "loss/logits": 0.36185871064662933, "step": 910 }, { "epoch": 0.057, "grad_norm": 4.09375, "grad_norm_var": 13.295059204101562, "learning_rate": 0.0001, "loss": 9.2474, "loss/crossentropy": 2.1740427017211914, "loss/hidden": 4.03125, "loss/jsd": 0.0, "loss/logits": 0.29893849790096283, "step": 912 }, { "epoch": 0.057125, "grad_norm": 4.28125, "grad_norm_var": 13.314383951822917, "learning_rate": 0.0001, "loss": 9.5688, "loss/crossentropy": 2.1147167086601257, "loss/hidden": 4.046875, "loss/jsd": 0.0, "loss/logits": 0.3557524085044861, "step": 914 }, { "epoch": 0.05725, "grad_norm": 3.5625, "grad_norm_var": 13.265576171875, "learning_rate": 0.0001, "loss": 9.2446, "loss/crossentropy": 2.0734463930130005, "loss/hidden": 4.0078125, "loss/jsd": 0.0, "loss/logits": 0.34740157425403595, "step": 916 }, { "epoch": 0.057375, "grad_norm": 4.34375, "grad_norm_var": 13.104325358072916, "learning_rate": 0.0001, "loss": 9.7651, "loss/crossentropy": 2.3448511362075806, "loss/hidden": 3.9296875, "loss/jsd": 0.0, "loss/logits": 0.3264952749013901, "step": 918 }, { "epoch": 0.0575, "grad_norm": 3.921875, "grad_norm_var": 13.216502888997395, "learning_rate": 0.0001, "loss": 9.5242, "loss/crossentropy": 2.3341474533081055, "loss/hidden": 3.796875, "loss/jsd": 0.0, "loss/logits": 0.3225775808095932, "step": 920 }, { "epoch": 0.057625, "grad_norm": 4.59375, "grad_norm_var": 0.2557281494140625, "learning_rate": 0.0001, "loss": 9.7221, "loss/crossentropy": 2.4475208520889282, "loss/hidden": 3.7578125, "loss/jsd": 0.0, "loss/logits": 0.36622609198093414, "step": 922 }, { "epoch": 0.05775, "grad_norm": 4.21875, "grad_norm_var": 0.11204427083333333, "learning_rate": 0.0001, "loss": 9.5311, "loss/crossentropy": 2.2603464126586914, "loss/hidden": 4.171875, "loss/jsd": 0.0, "loss/logits": 0.3138343095779419, "step": 924 }, { "epoch": 0.057875, "grad_norm": 4.4375, "grad_norm_var": 0.1020172119140625, "learning_rate": 0.0001, "loss": 9.7883, "loss/crossentropy": 2.2502031326293945, "loss/hidden": 3.921875, "loss/jsd": 0.0, "loss/logits": 0.3444705605506897, "step": 926 }, { "epoch": 0.058, "grad_norm": 4.625, "grad_norm_var": 0.11142171223958333, "learning_rate": 0.0001, "loss": 9.5446, "loss/crossentropy": 2.256578207015991, "loss/hidden": 3.7109375, "loss/jsd": 0.0, "loss/logits": 0.3087367117404938, "step": 928 }, { "epoch": 0.058125, "grad_norm": 3.75, "grad_norm_var": 0.12112528483072917, "learning_rate": 0.0001, "loss": 9.5936, "loss/crossentropy": 2.4345552921295166, "loss/hidden": 3.828125, "loss/jsd": 0.0, "loss/logits": 0.33862273395061493, "step": 930 }, { "epoch": 0.05825, "grad_norm": 4.375, "grad_norm_var": 0.6636057535807292, "learning_rate": 0.0001, "loss": 10.1023, "loss/crossentropy": 2.229047179222107, "loss/hidden": 3.84375, "loss/jsd": 0.0, "loss/logits": 0.32297711074352264, "step": 932 }, { "epoch": 0.058375, "grad_norm": 4.3125, "grad_norm_var": 0.6833648681640625, "learning_rate": 0.0001, "loss": 9.8382, "loss/crossentropy": 2.6197463274002075, "loss/hidden": 3.9140625, "loss/jsd": 0.0, "loss/logits": 0.394945427775383, "step": 934 }, { "epoch": 0.0585, "grad_norm": 3.734375, "grad_norm_var": 0.6986724853515625, "learning_rate": 0.0001, "loss": 9.5592, "loss/crossentropy": 2.216399669647217, "loss/hidden": 3.7109375, "loss/jsd": 0.0, "loss/logits": 0.34535615146160126, "step": 936 }, { "epoch": 0.058625, "grad_norm": 4.4375, "grad_norm_var": 0.6882558186848958, "learning_rate": 0.0001, "loss": 9.5469, "loss/crossentropy": 2.6119513511657715, "loss/hidden": 3.78125, "loss/jsd": 0.0, "loss/logits": 0.34995993971824646, "step": 938 }, { "epoch": 0.05875, "grad_norm": 4.5625, "grad_norm_var": 0.74136962890625, "learning_rate": 0.0001, "loss": 9.5875, "loss/crossentropy": 2.2213977575302124, "loss/hidden": 3.8046875, "loss/jsd": 0.0, "loss/logits": 0.29343417286872864, "step": 940 }, { "epoch": 0.058875, "grad_norm": 6.625, "grad_norm_var": 0.992724609375, "learning_rate": 0.0001, "loss": 9.9021, "loss/crossentropy": 2.5338159799575806, "loss/hidden": 3.84375, "loss/jsd": 0.0, "loss/logits": 0.3394032418727875, "step": 942 }, { "epoch": 0.059, "grad_norm": 4.34375, "grad_norm_var": 0.9622467041015625, "learning_rate": 0.0001, "loss": 9.5385, "loss/crossentropy": 2.337091326713562, "loss/hidden": 3.7734375, "loss/jsd": 0.0, "loss/logits": 0.3653382658958435, "step": 944 }, { "epoch": 0.059125, "grad_norm": 4.21875, "grad_norm_var": 0.9037343343098958, "learning_rate": 0.0001, "loss": 9.6615, "loss/crossentropy": 2.3409534692764282, "loss/hidden": 3.90625, "loss/jsd": 0.0, "loss/logits": 0.3641035109758377, "step": 946 }, { "epoch": 0.05925, "grad_norm": 5.0, "grad_norm_var": 0.4998118082682292, "learning_rate": 0.0001, "loss": 9.5936, "loss/crossentropy": 2.311855435371399, "loss/hidden": 3.96875, "loss/jsd": 0.0, "loss/logits": 0.43438956141471863, "step": 948 }, { "epoch": 0.059375, "grad_norm": 4.65625, "grad_norm_var": 0.4883371988932292, "learning_rate": 0.0001, "loss": 9.3192, "loss/crossentropy": 2.1787959337234497, "loss/hidden": 3.84375, "loss/jsd": 0.0, "loss/logits": 0.3181719183921814, "step": 950 }, { "epoch": 0.0595, "grad_norm": 4.28125, "grad_norm_var": 0.43896077473958334, "learning_rate": 0.0001, "loss": 9.6047, "loss/crossentropy": 2.2675414085388184, "loss/hidden": 3.8203125, "loss/jsd": 0.0, "loss/logits": 0.33463895320892334, "step": 952 }, { "epoch": 0.059625, "grad_norm": 4.15625, "grad_norm_var": 0.4328125, "learning_rate": 0.0001, "loss": 9.6131, "loss/crossentropy": 2.3636629581451416, "loss/hidden": 4.03125, "loss/jsd": 0.0, "loss/logits": 0.37111011147499084, "step": 954 }, { "epoch": 0.05975, "grad_norm": 4.25, "grad_norm_var": 0.38201497395833334, "learning_rate": 0.0001, "loss": 9.3829, "loss/crossentropy": 2.1130423545837402, "loss/hidden": 3.9296875, "loss/jsd": 0.0, "loss/logits": 0.32567116618156433, "step": 956 }, { "epoch": 0.059875, "grad_norm": 4.21875, "grad_norm_var": 0.0728515625, "learning_rate": 0.0001, "loss": 9.262, "loss/crossentropy": 2.051333010196686, "loss/hidden": 3.7578125, "loss/jsd": 0.0, "loss/logits": 0.30729782581329346, "step": 958 }, { "epoch": 0.06, "grad_norm": 4.03125, "grad_norm_var": 0.06295572916666667, "learning_rate": 0.0001, "loss": 9.611, "loss/crossentropy": 2.140830159187317, "loss/hidden": 3.984375, "loss/jsd": 0.0, "loss/logits": 0.3241441547870636, "step": 960 }, { "epoch": 0.060125, "grad_norm": 3.84375, "grad_norm_var": 0.0759765625, "learning_rate": 0.0001, "loss": 9.3228, "loss/crossentropy": 2.1913615465164185, "loss/hidden": 3.75, "loss/jsd": 0.0, "loss/logits": 0.3273606151342392, "step": 962 }, { "epoch": 0.06025, "grad_norm": 4.09375, "grad_norm_var": 0.0519683837890625, "learning_rate": 0.0001, "loss": 9.4052, "loss/crossentropy": 1.9163798093795776, "loss/hidden": 3.78125, "loss/jsd": 0.0, "loss/logits": 0.3350640535354614, "step": 964 }, { "epoch": 0.060375, "grad_norm": 4.46875, "grad_norm_var": 0.0576568603515625, "learning_rate": 0.0001, "loss": 9.8501, "loss/crossentropy": 2.414613366127014, "loss/hidden": 4.046875, "loss/jsd": 0.0, "loss/logits": 0.32891781628131866, "step": 966 }, { "epoch": 0.0605, "grad_norm": 4.46875, "grad_norm_var": 0.06399637858072917, "learning_rate": 0.0001, "loss": 9.3503, "loss/crossentropy": 2.297875761985779, "loss/hidden": 3.875, "loss/jsd": 0.0, "loss/logits": 0.3497025966644287, "step": 968 }, { "epoch": 0.060625, "grad_norm": 4.0, "grad_norm_var": 0.3565338134765625, "learning_rate": 0.0001, "loss": 9.3331, "loss/crossentropy": 2.2438501119613647, "loss/hidden": 3.8046875, "loss/jsd": 0.0, "loss/logits": 0.31031179428100586, "step": 970 }, { "epoch": 0.06075, "grad_norm": 4.1875, "grad_norm_var": 0.3681711832682292, "learning_rate": 0.0001, "loss": 9.5492, "loss/crossentropy": 2.3227267265319824, "loss/hidden": 3.6953125, "loss/jsd": 0.0, "loss/logits": 0.3225504010915756, "step": 972 }, { "epoch": 0.060875, "grad_norm": 4.09375, "grad_norm_var": 0.37629292805989584, "learning_rate": 0.0001, "loss": 9.313, "loss/crossentropy": 1.9936909079551697, "loss/hidden": 3.828125, "loss/jsd": 0.0, "loss/logits": 0.3204316198825836, "step": 974 }, { "epoch": 0.061, "grad_norm": 3.890625, "grad_norm_var": 0.39006245930989586, "learning_rate": 0.0001, "loss": 9.3335, "loss/crossentropy": 2.176134943962097, "loss/hidden": 3.6875, "loss/jsd": 0.0, "loss/logits": 0.3095164895057678, "step": 976 }, { "epoch": 0.061125, "grad_norm": 4.25, "grad_norm_var": 0.3955149332682292, "learning_rate": 0.0001, "loss": 9.4377, "loss/crossentropy": 2.2390334606170654, "loss/hidden": 3.7265625, "loss/jsd": 0.0, "loss/logits": 0.34077727794647217, "step": 978 }, { "epoch": 0.06125, "grad_norm": 4.40625, "grad_norm_var": 0.37867431640625, "learning_rate": 0.0001, "loss": 9.322, "loss/crossentropy": 2.320459246635437, "loss/hidden": 3.8671875, "loss/jsd": 0.0, "loss/logits": 0.3498273342847824, "step": 980 }, { "epoch": 0.061375, "grad_norm": 4.8125, "grad_norm_var": 0.7111490885416667, "learning_rate": 0.0001, "loss": 9.6542, "loss/crossentropy": 2.1645848751068115, "loss/hidden": 3.828125, "loss/jsd": 0.0, "loss/logits": 0.3087887167930603, "step": 982 }, { "epoch": 0.0615, "grad_norm": 3.828125, "grad_norm_var": 0.7261138916015625, "learning_rate": 0.0001, "loss": 9.2565, "loss/crossentropy": 2.195105791091919, "loss/hidden": 3.703125, "loss/jsd": 0.0, "loss/logits": 0.31541192531585693, "step": 984 }, { "epoch": 0.061625, "grad_norm": 4.40625, "grad_norm_var": 0.4574045817057292, "learning_rate": 0.0001, "loss": 9.5938, "loss/crossentropy": 2.5614267587661743, "loss/hidden": 3.8203125, "loss/jsd": 0.0, "loss/logits": 0.33089199662208557, "step": 986 }, { "epoch": 0.06175, "grad_norm": 3.828125, "grad_norm_var": 0.4867828369140625, "learning_rate": 0.0001, "loss": 9.4029, "loss/crossentropy": 2.3294313549995422, "loss/hidden": 3.7109375, "loss/jsd": 0.0, "loss/logits": 0.3205059766769409, "step": 988 }, { "epoch": 0.061875, "grad_norm": 4.5625, "grad_norm_var": 0.4893870035807292, "learning_rate": 0.0001, "loss": 9.7033, "loss/crossentropy": 1.9489195942878723, "loss/hidden": 3.8984375, "loss/jsd": 0.0, "loss/logits": 0.339836522936821, "step": 990 }, { "epoch": 0.062, "grad_norm": 4.40625, "grad_norm_var": 0.4716135660807292, "learning_rate": 0.0001, "loss": 9.4979, "loss/crossentropy": 2.524012327194214, "loss/hidden": 3.8828125, "loss/jsd": 0.0, "loss/logits": 0.3404659479856491, "step": 992 }, { "epoch": 0.062125, "grad_norm": 4.375, "grad_norm_var": 0.45547587076822915, "learning_rate": 0.0001, "loss": 9.2762, "loss/crossentropy": 2.2983919382095337, "loss/hidden": 3.75, "loss/jsd": 0.0, "loss/logits": 0.3214043080806732, "step": 994 }, { "epoch": 0.06225, "grad_norm": 4.625, "grad_norm_var": 1.3710927327473958, "learning_rate": 0.0001, "loss": 9.6949, "loss/crossentropy": 1.94156152009964, "loss/hidden": 3.828125, "loss/jsd": 0.0, "loss/logits": 0.3371322900056839, "step": 996 }, { "epoch": 0.062375, "grad_norm": 3.921875, "grad_norm_var": 1.1604817708333333, "learning_rate": 0.0001, "loss": 9.3358, "loss/crossentropy": 2.2170941829681396, "loss/hidden": 3.8828125, "loss/jsd": 0.0, "loss/logits": 0.3620257079601288, "step": 998 }, { "epoch": 0.0625, "grad_norm": 4.03125, "grad_norm_var": 1.165363566080729, "learning_rate": 0.0001, "loss": 9.2476, "loss/crossentropy": 2.271137237548828, "loss/hidden": 3.875, "loss/jsd": 0.0, "loss/logits": 0.3439271152019501, "step": 1000 }, { "epoch": 0.062625, "grad_norm": 3.84375, "grad_norm_var": 1.159919230143229, "learning_rate": 0.0001, "loss": 9.3267, "loss/crossentropy": 1.887793481349945, "loss/hidden": 3.90625, "loss/jsd": 0.0, "loss/logits": 0.2793608605861664, "step": 1002 }, { "epoch": 0.06275, "grad_norm": 3.75, "grad_norm_var": 1.127936808268229, "learning_rate": 0.0001, "loss": 9.6683, "loss/crossentropy": 2.546959638595581, "loss/hidden": 3.75, "loss/jsd": 0.0, "loss/logits": 0.3406732678413391, "step": 1004 }, { "epoch": 0.062875, "grad_norm": 4.0, "grad_norm_var": 1.171027628580729, "learning_rate": 0.0001, "loss": 9.24, "loss/crossentropy": 2.0239084362983704, "loss/hidden": 3.6484375, "loss/jsd": 0.0, "loss/logits": 0.2986813932657242, "step": 1006 }, { "epoch": 0.063, "grad_norm": 5.34375, "grad_norm_var": 1.2344065348307292, "learning_rate": 0.0001, "loss": 9.439, "loss/crossentropy": 2.4122122526168823, "loss/hidden": 3.90625, "loss/jsd": 0.0, "loss/logits": 0.33327388763427734, "step": 1008 }, { "epoch": 0.063125, "grad_norm": 3.90625, "grad_norm_var": 1.2634440104166667, "learning_rate": 0.0001, "loss": 9.1892, "loss/crossentropy": 2.514481782913208, "loss/hidden": 3.8046875, "loss/jsd": 0.0, "loss/logits": 0.324728786945343, "step": 1010 }, { "epoch": 0.06325, "grad_norm": 3.8125, "grad_norm_var": 0.2579752604166667, "learning_rate": 0.0001, "loss": 9.0485, "loss/crossentropy": 2.093431532382965, "loss/hidden": 3.6640625, "loss/jsd": 0.0, "loss/logits": 0.2737080454826355, "step": 1012 }, { "epoch": 0.063375, "grad_norm": 3.984375, "grad_norm_var": 0.17375895182291667, "learning_rate": 0.0001, "loss": 9.3501, "loss/crossentropy": 2.4039806127548218, "loss/hidden": 3.75, "loss/jsd": 0.0, "loss/logits": 0.3349785953760147, "step": 1014 }, { "epoch": 0.0635, "grad_norm": 4.125, "grad_norm_var": 0.16858317057291666, "learning_rate": 0.0001, "loss": 9.4553, "loss/crossentropy": 2.2511470317840576, "loss/hidden": 3.6875, "loss/jsd": 0.0, "loss/logits": 0.3223019689321518, "step": 1016 }, { "epoch": 0.063625, "grad_norm": 4.625, "grad_norm_var": 0.23199869791666666, "learning_rate": 0.0001, "loss": 9.4476, "loss/crossentropy": 2.3230113983154297, "loss/hidden": 3.859375, "loss/jsd": 0.0, "loss/logits": 0.33511024713516235, "step": 1018 }, { "epoch": 0.06375, "grad_norm": 5.21875, "grad_norm_var": 0.3421834309895833, "learning_rate": 0.0001, "loss": 9.3437, "loss/crossentropy": 2.1574745178222656, "loss/hidden": 3.8828125, "loss/jsd": 0.0, "loss/logits": 0.34460465610027313, "step": 1020 }, { "epoch": 0.063875, "grad_norm": 4.15625, "grad_norm_var": 0.34429931640625, "learning_rate": 0.0001, "loss": 9.5896, "loss/crossentropy": 2.325391948223114, "loss/hidden": 3.8671875, "loss/jsd": 0.0, "loss/logits": 0.3762832581996918, "step": 1022 }, { "epoch": 0.064, "grad_norm": 3.75, "grad_norm_var": 0.32193603515625, "learning_rate": 0.0001, "loss": 9.2142, "loss/crossentropy": 1.9802694916725159, "loss/hidden": 3.921875, "loss/jsd": 0.0, "loss/logits": 0.30826494097709656, "step": 1024 }, { "epoch": 0.064125, "grad_norm": 3.9375, "grad_norm_var": 0.3210601806640625, "learning_rate": 0.0001, "loss": 9.4929, "loss/crossentropy": 2.2971783876419067, "loss/hidden": 3.65625, "loss/jsd": 0.0, "loss/logits": 0.3391452133655548, "step": 1026 }, { "epoch": 0.06425, "grad_norm": 3.765625, "grad_norm_var": 0.32535400390625, "learning_rate": 0.0001, "loss": 9.3575, "loss/crossentropy": 2.0553237199783325, "loss/hidden": 3.671875, "loss/jsd": 0.0, "loss/logits": 0.30592888593673706, "step": 1028 }, { "epoch": 0.064375, "grad_norm": 4.53125, "grad_norm_var": 0.4266021728515625, "learning_rate": 0.0001, "loss": 10.0171, "loss/crossentropy": 2.124045729637146, "loss/hidden": 4.125, "loss/jsd": 0.0, "loss/logits": 0.4383738338947296, "step": 1030 }, { "epoch": 0.0645, "grad_norm": 5.125, "grad_norm_var": 0.4291005452473958, "learning_rate": 0.0001, "loss": 9.6314, "loss/crossentropy": 2.1157106161117554, "loss/hidden": 3.7734375, "loss/jsd": 0.0, "loss/logits": 0.30019576847553253, "step": 1032 }, { "epoch": 0.064625, "grad_norm": 4.15625, "grad_norm_var": 0.44502665201822916, "learning_rate": 0.0001, "loss": 9.439, "loss/crossentropy": 2.0622119903564453, "loss/hidden": 3.859375, "loss/jsd": 0.0, "loss/logits": 0.32624077796936035, "step": 1034 }, { "epoch": 0.06475, "grad_norm": 4.84375, "grad_norm_var": 0.37735087076822915, "learning_rate": 0.0001, "loss": 9.5106, "loss/crossentropy": 2.134315252304077, "loss/hidden": 3.875, "loss/jsd": 0.0, "loss/logits": 0.353405624628067, "step": 1036 }, { "epoch": 0.064875, "grad_norm": 3.796875, "grad_norm_var": 0.3602498372395833, "learning_rate": 0.0001, "loss": 9.5207, "loss/crossentropy": 2.1483497619628906, "loss/hidden": 3.75, "loss/jsd": 0.0, "loss/logits": 0.3199582099914551, "step": 1038 }, { "epoch": 0.065, "grad_norm": 4.3125, "grad_norm_var": 0.32724202473958336, "learning_rate": 0.0001, "loss": 9.6394, "loss/crossentropy": 2.3073023557662964, "loss/hidden": 3.859375, "loss/jsd": 0.0, "loss/logits": 0.3543713688850403, "step": 1040 }, { "epoch": 0.065125, "grad_norm": 3.75, "grad_norm_var": 0.35016988118489584, "learning_rate": 0.0001, "loss": 9.3647, "loss/crossentropy": 2.2450332641601562, "loss/hidden": 3.8984375, "loss/jsd": 0.0, "loss/logits": 0.3619850426912308, "step": 1042 }, { "epoch": 0.06525, "grad_norm": 3.84375, "grad_norm_var": 0.35196024576822915, "learning_rate": 0.0001, "loss": 9.4196, "loss/crossentropy": 2.2498934268951416, "loss/hidden": 3.9375, "loss/jsd": 0.0, "loss/logits": 0.347659170627594, "step": 1044 }, { "epoch": 0.065375, "grad_norm": 4.0625, "grad_norm_var": 0.20666402180989582, "learning_rate": 0.0001, "loss": 9.2666, "loss/crossentropy": 2.587206482887268, "loss/hidden": 3.7734375, "loss/jsd": 0.0, "loss/logits": 0.3400786221027374, "step": 1046 }, { "epoch": 0.0655, "grad_norm": 3.578125, "grad_norm_var": 0.15130208333333334, "learning_rate": 0.0001, "loss": 9.2092, "loss/crossentropy": 2.194170832633972, "loss/hidden": 3.765625, "loss/jsd": 0.0, "loss/logits": 0.32428255677223206, "step": 1048 }, { "epoch": 0.065625, "grad_norm": 4.34375, "grad_norm_var": 0.14869384765625, "learning_rate": 0.0001, "loss": 9.4251, "loss/crossentropy": 2.1931979656219482, "loss/hidden": 3.8203125, "loss/jsd": 0.0, "loss/logits": 0.3415902405977249, "step": 1050 }, { "epoch": 0.06575, "grad_norm": 3.78125, "grad_norm_var": 0.0619140625, "learning_rate": 0.0001, "loss": 9.1983, "loss/crossentropy": 2.3534958362579346, "loss/hidden": 3.8984375, "loss/jsd": 0.0, "loss/logits": 0.34609000384807587, "step": 1052 }, { "epoch": 0.065875, "grad_norm": 4.0625, "grad_norm_var": 0.0712890625, "learning_rate": 0.0001, "loss": 9.0208, "loss/crossentropy": 2.121673583984375, "loss/hidden": 3.6875, "loss/jsd": 0.0, "loss/logits": 0.3000381886959076, "step": 1054 }, { "epoch": 0.066, "grad_norm": 3.859375, "grad_norm_var": 0.06524149576822917, "learning_rate": 0.0001, "loss": 9.202, "loss/crossentropy": 2.418868660926819, "loss/hidden": 3.6875, "loss/jsd": 0.0, "loss/logits": 0.32929007709026337, "step": 1056 }, { "epoch": 0.066125, "grad_norm": 4.5, "grad_norm_var": 0.07296549479166667, "learning_rate": 0.0001, "loss": 9.4209, "loss/crossentropy": 2.3581680059432983, "loss/hidden": 3.890625, "loss/jsd": 0.0, "loss/logits": 0.39325736463069916, "step": 1058 }, { "epoch": 0.06625, "grad_norm": 3.796875, "grad_norm_var": 0.08592020670572917, "learning_rate": 0.0001, "loss": 9.0289, "loss/crossentropy": 2.069228768348694, "loss/hidden": 3.6875, "loss/jsd": 0.0, "loss/logits": 0.311954140663147, "step": 1060 }, { "epoch": 0.066375, "grad_norm": 3.796875, "grad_norm_var": 0.15517171223958334, "learning_rate": 0.0001, "loss": 9.6988, "loss/crossentropy": 2.4181917905807495, "loss/hidden": 3.984375, "loss/jsd": 0.0, "loss/logits": 0.40750065445899963, "step": 1062 }, { "epoch": 0.0665, "grad_norm": 4.28125, "grad_norm_var": 0.14216206868489584, "learning_rate": 0.0001, "loss": 9.5099, "loss/crossentropy": 2.424588680267334, "loss/hidden": 3.75, "loss/jsd": 0.0, "loss/logits": 0.3340979814529419, "step": 1064 }, { "epoch": 0.066625, "grad_norm": 3.625, "grad_norm_var": 0.15505269368489583, "learning_rate": 0.0001, "loss": 9.1677, "loss/crossentropy": 2.2103652954101562, "loss/hidden": 3.640625, "loss/jsd": 0.0, "loss/logits": 0.3072456121444702, "step": 1066 }, { "epoch": 0.06675, "grad_norm": 3.96875, "grad_norm_var": 0.14728902180989584, "learning_rate": 0.0001, "loss": 9.2546, "loss/crossentropy": 2.4093486070632935, "loss/hidden": 3.734375, "loss/jsd": 0.0, "loss/logits": 0.3384798616170883, "step": 1068 }, { "epoch": 0.066875, "grad_norm": 4.40625, "grad_norm_var": 0.14303385416666667, "learning_rate": 0.0001, "loss": 9.3364, "loss/crossentropy": 2.1890220642089844, "loss/hidden": 3.7421875, "loss/jsd": 0.0, "loss/logits": 0.2857840359210968, "step": 1070 }, { "epoch": 0.067, "grad_norm": 5.25, "grad_norm_var": 0.2286041259765625, "learning_rate": 0.0001, "loss": 9.5227, "loss/crossentropy": 2.253679633140564, "loss/hidden": 3.921875, "loss/jsd": 0.0, "loss/logits": 0.36102280020713806, "step": 1072 }, { "epoch": 0.067125, "grad_norm": 4.0, "grad_norm_var": 0.24812723795572916, "learning_rate": 0.0001, "loss": 9.4893, "loss/crossentropy": 2.292818784713745, "loss/hidden": 3.7265625, "loss/jsd": 0.0, "loss/logits": 0.3276713937520981, "step": 1074 }, { "epoch": 0.06725, "grad_norm": 6.34375, "grad_norm_var": 0.9611724853515625, "learning_rate": 0.0001, "loss": 9.85, "loss/crossentropy": 2.514404058456421, "loss/hidden": 3.7734375, "loss/jsd": 0.0, "loss/logits": 0.34391574561595917, "step": 1076 }, { "epoch": 0.067375, "grad_norm": 4.25, "grad_norm_var": 0.9194620768229167, "learning_rate": 0.0001, "loss": 9.5348, "loss/crossentropy": 2.343267798423767, "loss/hidden": 3.6875, "loss/jsd": 0.0, "loss/logits": 0.3307941257953644, "step": 1078 }, { "epoch": 0.0675, "grad_norm": 4.25, "grad_norm_var": 0.9223307291666667, "learning_rate": 0.0001, "loss": 9.5136, "loss/crossentropy": 2.442410469055176, "loss/hidden": 3.6875, "loss/jsd": 0.0, "loss/logits": 0.3149748295545578, "step": 1080 }, { "epoch": 0.067625, "grad_norm": 4.125, "grad_norm_var": 0.8810831705729166, "learning_rate": 0.0001, "loss": 9.4811, "loss/crossentropy": 2.2090275287628174, "loss/hidden": 3.8828125, "loss/jsd": 0.0, "loss/logits": 0.30319978296756744, "step": 1082 }, { "epoch": 0.06775, "grad_norm": 3.875, "grad_norm_var": 0.8578776041666667, "learning_rate": 0.0001, "loss": 9.4955, "loss/crossentropy": 2.3490201830863953, "loss/hidden": 3.7578125, "loss/jsd": 0.0, "loss/logits": 0.3528668284416199, "step": 1084 }, { "epoch": 0.067875, "grad_norm": 3.796875, "grad_norm_var": 0.9058258056640625, "learning_rate": 0.0001, "loss": 9.2706, "loss/crossentropy": 2.276387929916382, "loss/hidden": 3.78125, "loss/jsd": 0.0, "loss/logits": 0.3269159346818924, "step": 1086 }, { "epoch": 0.068, "grad_norm": 4.0, "grad_norm_var": 0.8852366129557292, "learning_rate": 0.0001, "loss": 9.2629, "loss/crossentropy": 2.3478105068206787, "loss/hidden": 3.796875, "loss/jsd": 0.0, "loss/logits": 0.2995656430721283, "step": 1088 }, { "epoch": 0.068125, "grad_norm": 4.84375, "grad_norm_var": 0.8806467692057292, "learning_rate": 0.0001, "loss": 9.7304, "loss/crossentropy": 2.3621827363967896, "loss/hidden": 3.8828125, "loss/jsd": 0.0, "loss/logits": 0.4546060711145401, "step": 1090 }, { "epoch": 0.06825, "grad_norm": 3.8125, "grad_norm_var": 0.07867838541666666, "learning_rate": 0.0001, "loss": 9.2301, "loss/crossentropy": 2.3968265056610107, "loss/hidden": 3.6953125, "loss/jsd": 0.0, "loss/logits": 0.3126474618911743, "step": 1092 }, { "epoch": 0.068375, "grad_norm": 3.9375, "grad_norm_var": 0.10178629557291667, "learning_rate": 0.0001, "loss": 9.1654, "loss/crossentropy": 2.1036200523376465, "loss/hidden": 3.8203125, "loss/jsd": 0.0, "loss/logits": 0.32715390622615814, "step": 1094 }, { "epoch": 0.0685, "grad_norm": 4.3125, "grad_norm_var": 0.11669514973958334, "learning_rate": 0.0001, "loss": 9.373, "loss/crossentropy": 2.3250420093536377, "loss/hidden": 3.7578125, "loss/jsd": 0.0, "loss/logits": 0.33760012686252594, "step": 1096 }, { "epoch": 0.068625, "grad_norm": 3.984375, "grad_norm_var": 0.1229644775390625, "learning_rate": 0.0001, "loss": 9.4241, "loss/crossentropy": 2.2138954401016235, "loss/hidden": 3.734375, "loss/jsd": 0.0, "loss/logits": 0.3191235810518265, "step": 1098 }, { "epoch": 0.06875, "grad_norm": 4.1875, "grad_norm_var": 0.1237213134765625, "learning_rate": 0.0001, "loss": 9.2826, "loss/crossentropy": 2.422287702560425, "loss/hidden": 3.75, "loss/jsd": 0.0, "loss/logits": 0.339611291885376, "step": 1100 }, { "epoch": 0.068875, "grad_norm": 3.71875, "grad_norm_var": 0.12141520182291667, "learning_rate": 0.0001, "loss": 9.3549, "loss/crossentropy": 2.3542439937591553, "loss/hidden": 3.7578125, "loss/jsd": 0.0, "loss/logits": 0.33625495433807373, "step": 1102 }, { "epoch": 0.069, "grad_norm": 6.125, "grad_norm_var": 0.3832183837890625, "learning_rate": 0.0001, "loss": 9.1507, "loss/crossentropy": 2.123876988887787, "loss/hidden": 3.8515625, "loss/jsd": 0.0, "loss/logits": 0.3458560109138489, "step": 1104 }, { "epoch": 0.069125, "grad_norm": 4.1875, "grad_norm_var": 0.36404520670572915, "learning_rate": 0.0001, "loss": 9.1443, "loss/crossentropy": 2.043139398097992, "loss/hidden": 3.75, "loss/jsd": 0.0, "loss/logits": 0.3290232867002487, "step": 1106 }, { "epoch": 0.06925, "grad_norm": 6.28125, "grad_norm_var": 0.67164306640625, "learning_rate": 0.0001, "loss": 8.9936, "loss/crossentropy": 2.1914546489715576, "loss/hidden": 3.9140625, "loss/jsd": 0.0, "loss/logits": 0.3750714063644409, "step": 1108 }, { "epoch": 0.069375, "grad_norm": 3.984375, "grad_norm_var": 0.7087392171223958, "learning_rate": 0.0001, "loss": 9.1233, "loss/crossentropy": 2.3304919004440308, "loss/hidden": 3.703125, "loss/jsd": 0.0, "loss/logits": 0.3122476786375046, "step": 1110 }, { "epoch": 0.0695, "grad_norm": 4.1875, "grad_norm_var": 0.7235260009765625, "learning_rate": 0.0001, "loss": 9.2897, "loss/crossentropy": 2.128778100013733, "loss/hidden": 3.7265625, "loss/jsd": 0.0, "loss/logits": 0.29984790086746216, "step": 1112 }, { "epoch": 0.069625, "grad_norm": 9.375, "grad_norm_var": 2.24293212890625, "learning_rate": 0.0001, "loss": 9.3617, "loss/crossentropy": 2.085534453392029, "loss/hidden": 3.78125, "loss/jsd": 0.0, "loss/logits": 0.3328994959592819, "step": 1114 }, { "epoch": 0.06975, "grad_norm": 4.21875, "grad_norm_var": 2.19111328125, "learning_rate": 0.0001, "loss": 9.2183, "loss/crossentropy": 2.293601393699646, "loss/hidden": 3.71875, "loss/jsd": 0.0, "loss/logits": 0.3058394640684128, "step": 1116 }, { "epoch": 0.069875, "grad_norm": 5.5625, "grad_norm_var": 2.2120351155598956, "learning_rate": 0.0001, "loss": 9.4662, "loss/crossentropy": 2.4517805576324463, "loss/hidden": 3.9140625, "loss/jsd": 0.0, "loss/logits": 0.3616568148136139, "step": 1118 }, { "epoch": 0.07, "grad_norm": 4.46875, "grad_norm_var": 2.064404296875, "learning_rate": 0.0001, "loss": 9.2767, "loss/crossentropy": 2.0485028624534607, "loss/hidden": 3.765625, "loss/jsd": 0.0, "loss/logits": 0.32380858063697815, "step": 1120 }, { "epoch": 0.070125, "grad_norm": 3.359375, "grad_norm_var": 2.187425740559896, "learning_rate": 0.0001, "loss": 9.3006, "loss/crossentropy": 2.361703395843506, "loss/hidden": 3.7265625, "loss/jsd": 0.0, "loss/logits": 0.3274669200181961, "step": 1122 }, { "epoch": 0.07025, "grad_norm": 4.1875, "grad_norm_var": 1.9487050374348958, "learning_rate": 0.0001, "loss": 9.7995, "loss/crossentropy": 2.339933753013611, "loss/hidden": 3.7890625, "loss/jsd": 0.0, "loss/logits": 0.3549206554889679, "step": 1124 }, { "epoch": 0.070375, "grad_norm": 5.3125, "grad_norm_var": 1.8818318684895834, "learning_rate": 0.0001, "loss": 9.6824, "loss/crossentropy": 2.4020055532455444, "loss/hidden": 3.7890625, "loss/jsd": 0.0, "loss/logits": 0.35619041323661804, "step": 1126 }, { "epoch": 0.0705, "grad_norm": 5.15625, "grad_norm_var": 4.245003255208333, "learning_rate": 0.0001, "loss": 9.2954, "loss/crossentropy": 2.028614342212677, "loss/hidden": 3.6484375, "loss/jsd": 0.0, "loss/logits": 0.2933530956506729, "step": 1128 }, { "epoch": 0.070625, "grad_norm": 11.25, "grad_norm_var": 9.55810546875, "learning_rate": 0.0001, "loss": 10.2405, "loss/crossentropy": 2.6706130504608154, "loss/hidden": 4.0078125, "loss/jsd": 0.0, "loss/logits": 0.5690062493085861, "step": 1130 }, { "epoch": 0.07075, "grad_norm": 4.125, "grad_norm_var": 9.512593587239584, "learning_rate": 0.0001, "loss": 9.505, "loss/crossentropy": 2.216508388519287, "loss/hidden": 3.78125, "loss/jsd": 0.0, "loss/logits": 0.34619633853435516, "step": 1132 }, { "epoch": 0.070875, "grad_norm": 3.78125, "grad_norm_var": 9.696572875976562, "learning_rate": 0.0001, "loss": 9.499, "loss/crossentropy": 2.484335422515869, "loss/hidden": 3.7421875, "loss/jsd": 0.0, "loss/logits": 0.32301226258277893, "step": 1134 }, { "epoch": 0.071, "grad_norm": 3.578125, "grad_norm_var": 10.04234619140625, "learning_rate": 0.0001, "loss": 9.1052, "loss/crossentropy": 2.148248791694641, "loss/hidden": 3.796875, "loss/jsd": 0.0, "loss/logits": 0.3175772875547409, "step": 1136 }, { "epoch": 0.071125, "grad_norm": 4.09375, "grad_norm_var": 9.868179321289062, "learning_rate": 0.0001, "loss": 9.5094, "loss/crossentropy": 2.3416961431503296, "loss/hidden": 3.7890625, "loss/jsd": 0.0, "loss/logits": 0.3289973586797714, "step": 1138 }, { "epoch": 0.07125, "grad_norm": 4.25, "grad_norm_var": 9.610399373372395, "learning_rate": 0.0001, "loss": 9.5606, "loss/crossentropy": 2.421358108520508, "loss/hidden": 3.9140625, "loss/jsd": 0.0, "loss/logits": 0.33253444731235504, "step": 1140 }, { "epoch": 0.071375, "grad_norm": 4.40625, "grad_norm_var": 9.937515258789062, "learning_rate": 0.0001, "loss": 9.2288, "loss/crossentropy": 2.1072720289230347, "loss/hidden": 3.640625, "loss/jsd": 0.0, "loss/logits": 0.2903750240802765, "step": 1142 }, { "epoch": 0.0715, "grad_norm": 3.625, "grad_norm_var": 8.450804646809896, "learning_rate": 0.0001, "loss": 9.102, "loss/crossentropy": 2.354582667350769, "loss/hidden": 3.71875, "loss/jsd": 0.0, "loss/logits": 0.27638527750968933, "step": 1144 }, { "epoch": 0.071625, "grad_norm": 4.25, "grad_norm_var": 0.7305084228515625, "learning_rate": 0.0001, "loss": 9.31, "loss/crossentropy": 2.280918002128601, "loss/hidden": 3.6875, "loss/jsd": 0.0, "loss/logits": 0.33110353350639343, "step": 1146 }, { "epoch": 0.07175, "grad_norm": 3.828125, "grad_norm_var": 0.71959228515625, "learning_rate": 0.0001, "loss": 9.1223, "loss/crossentropy": 2.5760377645492554, "loss/hidden": 3.75, "loss/jsd": 0.0, "loss/logits": 0.3157148212194443, "step": 1148 }, { "epoch": 0.071875, "grad_norm": 4.4375, "grad_norm_var": 1.3645345052083333, "learning_rate": 0.0001, "loss": 9.665, "loss/crossentropy": 2.343551754951477, "loss/hidden": 3.71875, "loss/jsd": 0.0, "loss/logits": 0.32151465117931366, "step": 1150 }, { "epoch": 0.072, "grad_norm": 4.1875, "grad_norm_var": 1.2975901285807292, "learning_rate": 0.0001, "loss": 9.2148, "loss/crossentropy": 2.241070032119751, "loss/hidden": 3.8125, "loss/jsd": 0.0, "loss/logits": 0.30810578167438507, "step": 1152 }, { "epoch": 0.072125, "grad_norm": 3.78125, "grad_norm_var": 0.9642079671223959, "learning_rate": 0.0001, "loss": 8.8935, "loss/crossentropy": 2.067093253135681, "loss/hidden": 3.7265625, "loss/jsd": 0.0, "loss/logits": 0.289178729057312, "step": 1154 }, { "epoch": 0.07225, "grad_norm": 3.765625, "grad_norm_var": 0.8681966145833333, "learning_rate": 0.0001, "loss": 9.4054, "loss/crossentropy": 2.6034939289093018, "loss/hidden": 3.8125, "loss/jsd": 0.0, "loss/logits": 0.3149688243865967, "step": 1156 }, { "epoch": 0.072375, "grad_norm": 3.609375, "grad_norm_var": 0.8993886311848959, "learning_rate": 0.0001, "loss": 9.0434, "loss/crossentropy": 2.411675810813904, "loss/hidden": 3.671875, "loss/jsd": 0.0, "loss/logits": 0.3017608970403671, "step": 1158 }, { "epoch": 0.0725, "grad_norm": 4.375, "grad_norm_var": 0.8865142822265625, "learning_rate": 0.0001, "loss": 9.462, "loss/crossentropy": 2.448275327682495, "loss/hidden": 3.703125, "loss/jsd": 0.0, "loss/logits": 0.3322191536426544, "step": 1160 }, { "epoch": 0.072625, "grad_norm": 4.28125, "grad_norm_var": 0.9890126546223958, "learning_rate": 0.0001, "loss": 9.0953, "loss/crossentropy": 2.0147193670272827, "loss/hidden": 3.703125, "loss/jsd": 0.0, "loss/logits": 0.3072509914636612, "step": 1162 }, { "epoch": 0.07275, "grad_norm": 3.5625, "grad_norm_var": 0.9968821207682291, "learning_rate": 0.0001, "loss": 9.0557, "loss/crossentropy": 2.0512733459472656, "loss/hidden": 3.8359375, "loss/jsd": 0.0, "loss/logits": 0.30344827473163605, "step": 1164 }, { "epoch": 0.072875, "grad_norm": 4.0625, "grad_norm_var": 0.2585245768229167, "learning_rate": 0.0001, "loss": 9.184, "loss/crossentropy": 2.2927803993225098, "loss/hidden": 3.7734375, "loss/jsd": 0.0, "loss/logits": 0.2943641096353531, "step": 1166 }, { "epoch": 0.073, "grad_norm": 5.0, "grad_norm_var": 6.699788411458333, "learning_rate": 0.0001, "loss": 9.6349, "loss/crossentropy": 2.1713006496429443, "loss/hidden": 3.8671875, "loss/jsd": 0.0, "loss/logits": 0.3370360732078552, "step": 1168 }, { "epoch": 0.073125, "grad_norm": 3.953125, "grad_norm_var": 6.6875966389973955, "learning_rate": 0.0001, "loss": 9.4196, "loss/crossentropy": 2.474995255470276, "loss/hidden": 3.7578125, "loss/jsd": 0.0, "loss/logits": 0.38732604682445526, "step": 1170 }, { "epoch": 0.07325, "grad_norm": 4.03125, "grad_norm_var": 6.662890625, "learning_rate": 0.0001, "loss": 9.2525, "loss/crossentropy": 2.072790205478668, "loss/hidden": 3.671875, "loss/jsd": 0.0, "loss/logits": 0.29025524854660034, "step": 1172 }, { "epoch": 0.073375, "grad_norm": 4.15625, "grad_norm_var": 6.500926717122396, "learning_rate": 0.0001, "loss": 9.3795, "loss/crossentropy": 2.099343776702881, "loss/hidden": 4.015625, "loss/jsd": 0.0, "loss/logits": 0.3865123689174652, "step": 1174 }, { "epoch": 0.0735, "grad_norm": 4.78125, "grad_norm_var": 6.411652628580729, "learning_rate": 0.0001, "loss": 9.3436, "loss/crossentropy": 2.4349101781845093, "loss/hidden": 3.8984375, "loss/jsd": 0.0, "loss/logits": 0.37168101966381073, "step": 1176 }, { "epoch": 0.073625, "grad_norm": 3.671875, "grad_norm_var": 6.4862620035807295, "learning_rate": 0.0001, "loss": 9.0767, "loss/crossentropy": 2.016998767852783, "loss/hidden": 3.7265625, "loss/jsd": 0.0, "loss/logits": 0.27959877252578735, "step": 1178 }, { "epoch": 0.07375, "grad_norm": 4.3125, "grad_norm_var": 6.372587076822916, "learning_rate": 0.0001, "loss": 9.2147, "loss/crossentropy": 2.506001353263855, "loss/hidden": 3.8046875, "loss/jsd": 0.0, "loss/logits": 0.3655714690685272, "step": 1180 }, { "epoch": 0.073875, "grad_norm": 4.25, "grad_norm_var": 6.386311848958333, "learning_rate": 0.0001, "loss": 9.2234, "loss/crossentropy": 2.176361560821533, "loss/hidden": 3.796875, "loss/jsd": 0.0, "loss/logits": 0.3110087811946869, "step": 1182 }, { "epoch": 0.074, "grad_norm": 3.640625, "grad_norm_var": 0.12685139973958334, "learning_rate": 0.0001, "loss": 9.1954, "loss/crossentropy": 2.318281054496765, "loss/hidden": 3.7109375, "loss/jsd": 0.0, "loss/logits": 0.34135954082012177, "step": 1184 }, { "epoch": 0.074125, "grad_norm": 6.40625, "grad_norm_var": 0.4820475260416667, "learning_rate": 0.0001, "loss": 9.4297, "loss/crossentropy": 2.1393545866012573, "loss/hidden": 3.6875, "loss/jsd": 0.0, "loss/logits": 0.3078417629003525, "step": 1186 }, { "epoch": 0.07425, "grad_norm": 3.953125, "grad_norm_var": 0.4628082275390625, "learning_rate": 0.0001, "loss": 9.2797, "loss/crossentropy": 2.2695345878601074, "loss/hidden": 3.7421875, "loss/jsd": 0.0, "loss/logits": 0.35591304302215576, "step": 1188 }, { "epoch": 0.074375, "grad_norm": 4.0625, "grad_norm_var": 0.4542795817057292, "learning_rate": 0.0001, "loss": 9.4351, "loss/crossentropy": 2.353412628173828, "loss/hidden": 3.8046875, "loss/jsd": 0.0, "loss/logits": 0.3381505161523819, "step": 1190 }, { "epoch": 0.0745, "grad_norm": 3.8125, "grad_norm_var": 0.43408101399739585, "learning_rate": 0.0001, "loss": 9.3799, "loss/crossentropy": 2.1378670930862427, "loss/hidden": 3.65625, "loss/jsd": 0.0, "loss/logits": 0.3078790009021759, "step": 1192 }, { "epoch": 0.074625, "grad_norm": 4.59375, "grad_norm_var": 0.44195963541666666, "learning_rate": 0.0001, "loss": 9.2594, "loss/crossentropy": 2.667215347290039, "loss/hidden": 3.7734375, "loss/jsd": 0.0, "loss/logits": 0.31856001913547516, "step": 1194 }, { "epoch": 0.07475, "grad_norm": 3.71875, "grad_norm_var": 0.45224202473958336, "learning_rate": 0.0001, "loss": 9.5313, "loss/crossentropy": 2.635989189147949, "loss/hidden": 3.7734375, "loss/jsd": 0.0, "loss/logits": 0.3028344362974167, "step": 1196 }, { "epoch": 0.074875, "grad_norm": 3.75, "grad_norm_var": 0.4476470947265625, "learning_rate": 0.0001, "loss": 9.3138, "loss/crossentropy": 2.296648859977722, "loss/hidden": 3.7109375, "loss/jsd": 0.0, "loss/logits": 0.3075595498085022, "step": 1198 }, { "epoch": 0.075, "grad_norm": 3.484375, "grad_norm_var": 0.48983968098958336, "learning_rate": 0.0001, "loss": 9.1145, "loss/crossentropy": 2.3698781728744507, "loss/hidden": 3.7265625, "loss/jsd": 0.0, "loss/logits": 0.32711467146873474, "step": 1200 }, { "epoch": 0.075125, "grad_norm": 4.40625, "grad_norm_var": 0.12192281087239583, "learning_rate": 0.0001, "loss": 9.1895, "loss/crossentropy": 2.335735321044922, "loss/hidden": 3.71875, "loss/jsd": 0.0, "loss/logits": 0.338960200548172, "step": 1202 }, { "epoch": 0.07525, "grad_norm": 4.3125, "grad_norm_var": 0.12410481770833333, "learning_rate": 0.0001, "loss": 9.4811, "loss/crossentropy": 2.410987138748169, "loss/hidden": 3.7578125, "loss/jsd": 0.0, "loss/logits": 0.36192595958709717, "step": 1204 }, { "epoch": 0.075375, "grad_norm": 3.96875, "grad_norm_var": 0.11915690104166667, "learning_rate": 0.0001, "loss": 8.9214, "loss/crossentropy": 1.708968698978424, "loss/hidden": 3.7578125, "loss/jsd": 0.0, "loss/logits": 0.2829563319683075, "step": 1206 }, { "epoch": 0.0755, "grad_norm": 4.21875, "grad_norm_var": 0.12903238932291666, "learning_rate": 0.0001, "loss": 9.2257, "loss/crossentropy": 2.0275624990463257, "loss/hidden": 3.734375, "loss/jsd": 0.0, "loss/logits": 0.3022371232509613, "step": 1208 }, { "epoch": 0.075625, "grad_norm": 4.28125, "grad_norm_var": 0.10429585774739583, "learning_rate": 0.0001, "loss": 9.1546, "loss/crossentropy": 2.370418667793274, "loss/hidden": 3.6484375, "loss/jsd": 0.0, "loss/logits": 0.30917173624038696, "step": 1210 }, { "epoch": 0.07575, "grad_norm": 3.890625, "grad_norm_var": 0.10338134765625, "learning_rate": 0.0001, "loss": 9.3432, "loss/crossentropy": 2.589500308036804, "loss/hidden": 3.765625, "loss/jsd": 0.0, "loss/logits": 0.33214709162712097, "step": 1212 }, { "epoch": 0.075875, "grad_norm": 4.03125, "grad_norm_var": 0.1165191650390625, "learning_rate": 0.0001, "loss": 9.0814, "loss/crossentropy": 2.036685049533844, "loss/hidden": 3.6640625, "loss/jsd": 0.0, "loss/logits": 0.2987980991601944, "step": 1214 }, { "epoch": 0.076, "grad_norm": 3.875, "grad_norm_var": 0.08604227701822917, "learning_rate": 0.0001, "loss": 9.229, "loss/crossentropy": 2.42414653301239, "loss/hidden": 3.671875, "loss/jsd": 0.0, "loss/logits": 0.3331410139799118, "step": 1216 }, { "epoch": 0.076125, "grad_norm": 3.375, "grad_norm_var": 0.08454488118489584, "learning_rate": 0.0001, "loss": 9.0005, "loss/crossentropy": 2.299575924873352, "loss/hidden": 3.75, "loss/jsd": 0.0, "loss/logits": 0.31778840720653534, "step": 1218 }, { "epoch": 0.07625, "grad_norm": 3.71875, "grad_norm_var": 0.0822906494140625, "learning_rate": 0.0001, "loss": 8.8719, "loss/crossentropy": 2.1413676738739014, "loss/hidden": 3.859375, "loss/jsd": 0.0, "loss/logits": 0.311617910861969, "step": 1220 }, { "epoch": 0.076375, "grad_norm": 3.796875, "grad_norm_var": 0.0840240478515625, "learning_rate": 0.0001, "loss": 9.1024, "loss/crossentropy": 2.3143566846847534, "loss/hidden": 3.578125, "loss/jsd": 0.0, "loss/logits": 0.30187341570854187, "step": 1222 }, { "epoch": 0.0765, "grad_norm": 4.0, "grad_norm_var": 0.06832275390625, "learning_rate": 0.0001, "loss": 9.1304, "loss/crossentropy": 2.3266204595565796, "loss/hidden": 3.671875, "loss/jsd": 0.0, "loss/logits": 0.31053149700164795, "step": 1224 }, { "epoch": 0.076625, "grad_norm": 3.96875, "grad_norm_var": 0.07386067708333334, "learning_rate": 0.0001, "loss": 9.2652, "loss/crossentropy": 2.536214232444763, "loss/hidden": 3.8671875, "loss/jsd": 0.0, "loss/logits": 0.37166669964790344, "step": 1226 }, { "epoch": 0.07675, "grad_norm": 3.953125, "grad_norm_var": 0.06737874348958334, "learning_rate": 0.0001, "loss": 8.9565, "loss/crossentropy": 2.0512280464172363, "loss/hidden": 3.6328125, "loss/jsd": 0.0, "loss/logits": 0.3225468099117279, "step": 1228 }, { "epoch": 0.076875, "grad_norm": 3.765625, "grad_norm_var": 0.05724283854166667, "learning_rate": 0.0001, "loss": 9.0504, "loss/crossentropy": 2.066978871822357, "loss/hidden": 3.6796875, "loss/jsd": 0.0, "loss/logits": 0.3017243593931198, "step": 1230 }, { "epoch": 0.077, "grad_norm": 3.921875, "grad_norm_var": 0.056864420572916664, "learning_rate": 0.0001, "loss": 8.8892, "loss/crossentropy": 2.005214273929596, "loss/hidden": 3.6875, "loss/jsd": 0.0, "loss/logits": 0.29410508275032043, "step": 1232 }, { "epoch": 0.077125, "grad_norm": 4.1875, "grad_norm_var": 0.0547760009765625, "learning_rate": 0.0001, "loss": 9.2019, "loss/crossentropy": 2.558223009109497, "loss/hidden": 3.8046875, "loss/jsd": 0.0, "loss/logits": 0.3275268077850342, "step": 1234 }, { "epoch": 0.07725, "grad_norm": 4.125, "grad_norm_var": 0.05310872395833333, "learning_rate": 0.0001, "loss": 9.3537, "loss/crossentropy": 2.270456552505493, "loss/hidden": 3.5625, "loss/jsd": 0.0, "loss/logits": 0.3018851578235626, "step": 1236 }, { "epoch": 0.077375, "grad_norm": 3.421875, "grad_norm_var": 0.06061909993489583, "learning_rate": 0.0001, "loss": 8.8465, "loss/crossentropy": 2.2115548849105835, "loss/hidden": 3.6640625, "loss/jsd": 0.0, "loss/logits": 0.3262307494878769, "step": 1238 }, { "epoch": 0.0775, "grad_norm": 3.953125, "grad_norm_var": 0.20393778483072916, "learning_rate": 0.0001, "loss": 8.966, "loss/crossentropy": 2.2800389528274536, "loss/hidden": 3.71875, "loss/jsd": 0.0, "loss/logits": 0.29695481061935425, "step": 1240 }, { "epoch": 0.077625, "grad_norm": 3.9375, "grad_norm_var": 0.1944976806640625, "learning_rate": 0.0001, "loss": 9.2918, "loss/crossentropy": 2.364116072654724, "loss/hidden": 3.671875, "loss/jsd": 0.0, "loss/logits": 0.3094239979982376, "step": 1242 }, { "epoch": 0.07775, "grad_norm": 4.21875, "grad_norm_var": 0.19739176432291666, "learning_rate": 0.0001, "loss": 8.9968, "loss/crossentropy": 2.254086196422577, "loss/hidden": 3.71875, "loss/jsd": 0.0, "loss/logits": 0.3183470666408539, "step": 1244 }, { "epoch": 0.077875, "grad_norm": 3.921875, "grad_norm_var": 0.20035400390625, "learning_rate": 0.0001, "loss": 9.3359, "loss/crossentropy": 2.2478840351104736, "loss/hidden": 3.8671875, "loss/jsd": 0.0, "loss/logits": 0.3539678752422333, "step": 1246 }, { "epoch": 0.078, "grad_norm": 5.03125, "grad_norm_var": 0.2570709228515625, "learning_rate": 0.0001, "loss": 9.1698, "loss/crossentropy": 2.2758136987686157, "loss/hidden": 3.71875, "loss/jsd": 0.0, "loss/logits": 0.38259510695934296, "step": 1248 }, { "epoch": 0.078125, "grad_norm": 3.859375, "grad_norm_var": 0.2472808837890625, "learning_rate": 0.0001, "loss": 9.6334, "loss/crossentropy": 2.1474008560180664, "loss/hidden": 3.7265625, "loss/jsd": 0.0, "loss/logits": 0.29591962695121765, "step": 1250 }, { "epoch": 0.07825, "grad_norm": 5.1875, "grad_norm_var": 0.31797587076822914, "learning_rate": 0.0001, "loss": 9.2623, "loss/crossentropy": 2.3829517364501953, "loss/hidden": 3.6640625, "loss/jsd": 0.0, "loss/logits": 0.3403499126434326, "step": 1252 }, { "epoch": 0.078375, "grad_norm": 6.78125, "grad_norm_var": 0.6898396809895834, "learning_rate": 0.0001, "loss": 9.5364, "loss/crossentropy": 2.2887972593307495, "loss/hidden": 3.8125, "loss/jsd": 0.0, "loss/logits": 0.3603130578994751, "step": 1254 }, { "epoch": 0.0785, "grad_norm": 6.125, "grad_norm_var": 0.8064198811848958, "learning_rate": 0.0001, "loss": 9.4036, "loss/crossentropy": 2.644718647003174, "loss/hidden": 3.7890625, "loss/jsd": 0.0, "loss/logits": 0.3263750374317169, "step": 1256 }, { "epoch": 0.078625, "grad_norm": 3.609375, "grad_norm_var": 0.8487782796223958, "learning_rate": 0.0001, "loss": 8.7724, "loss/crossentropy": 1.8396179676055908, "loss/hidden": 3.6484375, "loss/jsd": 0.0, "loss/logits": 0.274771586060524, "step": 1258 }, { "epoch": 0.07875, "grad_norm": 3.6875, "grad_norm_var": 0.89468994140625, "learning_rate": 0.0001, "loss": 9.1007, "loss/crossentropy": 1.961978793144226, "loss/hidden": 3.7578125, "loss/jsd": 0.0, "loss/logits": 0.3170587867498398, "step": 1260 }, { "epoch": 0.078875, "grad_norm": 3.953125, "grad_norm_var": 0.8986002604166666, "learning_rate": 0.0001, "loss": 9.3628, "loss/crossentropy": 2.3426570892333984, "loss/hidden": 3.6484375, "loss/jsd": 0.0, "loss/logits": 0.30132536590099335, "step": 1262 }, { "epoch": 0.079, "grad_norm": 6.0625, "grad_norm_var": 1.0810221354166667, "learning_rate": 0.0001, "loss": 9.6953, "loss/crossentropy": 2.4158884286880493, "loss/hidden": 3.890625, "loss/jsd": 0.0, "loss/logits": 0.437991201877594, "step": 1264 }, { "epoch": 0.079125, "grad_norm": 4.125, "grad_norm_var": 1.069774373372396, "learning_rate": 0.0001, "loss": 9.0493, "loss/crossentropy": 2.3454242944717407, "loss/hidden": 3.640625, "loss/jsd": 0.0, "loss/logits": 0.30436156690120697, "step": 1266 }, { "epoch": 0.07925, "grad_norm": 3.546875, "grad_norm_var": 1.032005818684896, "learning_rate": 0.0001, "loss": 9.1778, "loss/crossentropy": 2.103074848651886, "loss/hidden": 3.7578125, "loss/jsd": 0.0, "loss/logits": 0.3321879059076309, "step": 1268 }, { "epoch": 0.079375, "grad_norm": 3.90625, "grad_norm_var": 0.6053293863932292, "learning_rate": 0.0001, "loss": 8.891, "loss/crossentropy": 2.1898140907287598, "loss/hidden": 3.7734375, "loss/jsd": 0.0, "loss/logits": 0.29424312710762024, "step": 1270 }, { "epoch": 0.0795, "grad_norm": 3.96875, "grad_norm_var": 0.34118550618489585, "learning_rate": 0.0001, "loss": 9.1928, "loss/crossentropy": 2.27189838886261, "loss/hidden": 3.7421875, "loss/jsd": 0.0, "loss/logits": 0.3125281184911728, "step": 1272 }, { "epoch": 0.079625, "grad_norm": 3.578125, "grad_norm_var": 0.35318094889322915, "learning_rate": 0.0001, "loss": 9.0314, "loss/crossentropy": 2.3463791608810425, "loss/hidden": 3.703125, "loss/jsd": 0.0, "loss/logits": 0.3130747973918915, "step": 1274 }, { "epoch": 0.07975, "grad_norm": 3.65625, "grad_norm_var": 0.3703684488932292, "learning_rate": 0.0001, "loss": 9.0928, "loss/crossentropy": 2.312413811683655, "loss/hidden": 3.609375, "loss/jsd": 0.0, "loss/logits": 0.2839825302362442, "step": 1276 }, { "epoch": 0.079875, "grad_norm": 4.46875, "grad_norm_var": 0.3912760416666667, "learning_rate": 0.0001, "loss": 9.1112, "loss/crossentropy": 2.1585580110549927, "loss/hidden": 3.875, "loss/jsd": 0.0, "loss/logits": 0.327612966299057, "step": 1278 }, { "epoch": 0.08, "grad_norm": 3.828125, "grad_norm_var": 0.11272786458333334, "learning_rate": 0.0001, "loss": 9.3805, "loss/crossentropy": 2.376479387283325, "loss/hidden": 3.65625, "loss/jsd": 0.0, "loss/logits": 0.37470483779907227, "step": 1280 }, { "epoch": 0.080125, "grad_norm": 4.03125, "grad_norm_var": 0.11057942708333333, "learning_rate": 0.0001, "loss": 9.1178, "loss/crossentropy": 2.3756598234176636, "loss/hidden": 3.640625, "loss/jsd": 0.0, "loss/logits": 0.3167227506637573, "step": 1282 }, { "epoch": 0.08025, "grad_norm": 4.53125, "grad_norm_var": 0.12273763020833334, "learning_rate": 0.0001, "loss": 9.0996, "loss/crossentropy": 2.3639968633651733, "loss/hidden": 3.65625, "loss/jsd": 0.0, "loss/logits": 0.28630542755126953, "step": 1284 }, { "epoch": 0.080375, "grad_norm": 3.8125, "grad_norm_var": 0.11213785807291667, "learning_rate": 0.0001, "loss": 8.9982, "loss/crossentropy": 2.0546599626541138, "loss/hidden": 3.8515625, "loss/jsd": 0.0, "loss/logits": 0.2971881777048111, "step": 1286 }, { "epoch": 0.0805, "grad_norm": 7.125, "grad_norm_var": 0.7559153238932291, "learning_rate": 0.0001, "loss": 9.2574, "loss/crossentropy": 2.209296226501465, "loss/hidden": 3.6484375, "loss/jsd": 0.0, "loss/logits": 0.30241329967975616, "step": 1288 }, { "epoch": 0.080625, "grad_norm": 5.15625, "grad_norm_var": 0.7945302327473959, "learning_rate": 0.0001, "loss": 9.4954, "loss/crossentropy": 2.2742972373962402, "loss/hidden": 3.78125, "loss/jsd": 0.0, "loss/logits": 0.33018121123313904, "step": 1290 }, { "epoch": 0.08075, "grad_norm": 3.90625, "grad_norm_var": 0.7472005208333333, "learning_rate": 0.0001, "loss": 9.1376, "loss/crossentropy": 2.50320041179657, "loss/hidden": 3.7578125, "loss/jsd": 0.0, "loss/logits": 0.31275102496147156, "step": 1292 }, { "epoch": 0.080875, "grad_norm": 3.8125, "grad_norm_var": 0.7593709309895833, "learning_rate": 0.0001, "loss": 9.2656, "loss/crossentropy": 2.1488747000694275, "loss/hidden": 3.84375, "loss/jsd": 0.0, "loss/logits": 0.2928170934319496, "step": 1294 }, { "epoch": 0.081, "grad_norm": 3.5625, "grad_norm_var": 0.7713450113932292, "learning_rate": 0.0001, "loss": 9.0506, "loss/crossentropy": 1.9949141144752502, "loss/hidden": 3.6796875, "loss/jsd": 0.0, "loss/logits": 0.2894285097718239, "step": 1296 }, { "epoch": 0.081125, "grad_norm": 3.703125, "grad_norm_var": 0.7880696614583333, "learning_rate": 0.0001, "loss": 9.0447, "loss/crossentropy": 2.1965824961662292, "loss/hidden": 3.609375, "loss/jsd": 0.0, "loss/logits": 0.32401764392852783, "step": 1298 }, { "epoch": 0.08125, "grad_norm": 3.71875, "grad_norm_var": 0.7814687093098959, "learning_rate": 0.0001, "loss": 9.2897, "loss/crossentropy": 2.4025352001190186, "loss/hidden": 3.6640625, "loss/jsd": 0.0, "loss/logits": 0.3065558522939682, "step": 1300 }, { "epoch": 0.081375, "grad_norm": 3.6875, "grad_norm_var": 0.7743560791015625, "learning_rate": 0.0001, "loss": 9.3312, "loss/crossentropy": 2.1072371006011963, "loss/hidden": 3.5859375, "loss/jsd": 0.0, "loss/logits": 0.29968319833278656, "step": 1302 }, { "epoch": 0.0815, "grad_norm": 4.125, "grad_norm_var": 0.1817779541015625, "learning_rate": 0.0001, "loss": 8.9304, "loss/crossentropy": 2.067099928855896, "loss/hidden": 3.65625, "loss/jsd": 0.0, "loss/logits": 0.3024987578392029, "step": 1304 }, { "epoch": 0.081625, "grad_norm": 6.0, "grad_norm_var": 0.32427978515625, "learning_rate": 0.0001, "loss": 9.2762, "loss/crossentropy": 2.2191673517227173, "loss/hidden": 3.6953125, "loss/jsd": 0.0, "loss/logits": 0.3357800096273422, "step": 1306 }, { "epoch": 0.08175, "grad_norm": 3.796875, "grad_norm_var": 0.3276601155598958, "learning_rate": 0.0001, "loss": 9.0845, "loss/crossentropy": 2.130507230758667, "loss/hidden": 3.6640625, "loss/jsd": 0.0, "loss/logits": 0.29883837699890137, "step": 1308 }, { "epoch": 0.081875, "grad_norm": 3.734375, "grad_norm_var": 0.33126627604166664, "learning_rate": 0.0001, "loss": 9.4391, "loss/crossentropy": 2.3472490310668945, "loss/hidden": 3.6875, "loss/jsd": 0.0, "loss/logits": 0.3453030586242676, "step": 1310 }, { "epoch": 0.082, "grad_norm": 3.578125, "grad_norm_var": 0.34829813639322915, "learning_rate": 0.0001, "loss": 8.9858, "loss/crossentropy": 2.2621065378189087, "loss/hidden": 3.71875, "loss/jsd": 0.0, "loss/logits": 0.3136780560016632, "step": 1312 }, { "epoch": 0.082125, "grad_norm": 3.546875, "grad_norm_var": 0.3508209228515625, "learning_rate": 0.0001, "loss": 8.9322, "loss/crossentropy": 1.973158359527588, "loss/hidden": 3.6640625, "loss/jsd": 0.0, "loss/logits": 0.2951781302690506, "step": 1314 }, { "epoch": 0.08225, "grad_norm": 4.09375, "grad_norm_var": 0.3529693603515625, "learning_rate": 0.0001, "loss": 9.1387, "loss/crossentropy": 2.1336565017700195, "loss/hidden": 3.8203125, "loss/jsd": 0.0, "loss/logits": 0.3254096359014511, "step": 1316 }, { "epoch": 0.082375, "grad_norm": 3.84375, "grad_norm_var": 0.3795857747395833, "learning_rate": 0.0001, "loss": 9.1626, "loss/crossentropy": 2.2021815180778503, "loss/hidden": 3.609375, "loss/jsd": 0.0, "loss/logits": 0.3019871115684509, "step": 1318 }, { "epoch": 0.0825, "grad_norm": 4.4375, "grad_norm_var": 2.168016560872396, "learning_rate": 0.0001, "loss": 9.4591, "loss/crossentropy": 2.3603689670562744, "loss/hidden": 3.703125, "loss/jsd": 0.0, "loss/logits": 0.31613415479660034, "step": 1320 }, { "epoch": 0.082625, "grad_norm": 3.453125, "grad_norm_var": 2.0103515625, "learning_rate": 0.0001, "loss": 9.1654, "loss/crossentropy": 2.387214779853821, "loss/hidden": 3.6953125, "loss/jsd": 0.0, "loss/logits": 0.34119459986686707, "step": 1322 }, { "epoch": 0.08275, "grad_norm": 3.640625, "grad_norm_var": 2.035106404622396, "learning_rate": 0.0001, "loss": 9.3567, "loss/crossentropy": 2.510946035385132, "loss/hidden": 3.6875, "loss/jsd": 0.0, "loss/logits": 0.35769546031951904, "step": 1324 }, { "epoch": 0.082875, "grad_norm": 3.65625, "grad_norm_var": 2.0376912434895833, "learning_rate": 0.0001, "loss": 9.0669, "loss/crossentropy": 2.2185367345809937, "loss/hidden": 3.7109375, "loss/jsd": 0.0, "loss/logits": 0.3095611035823822, "step": 1326 }, { "epoch": 0.083, "grad_norm": 3.328125, "grad_norm_var": 2.0509104410807293, "learning_rate": 0.0001, "loss": 8.895, "loss/crossentropy": 2.4466623067855835, "loss/hidden": 3.7109375, "loss/jsd": 0.0, "loss/logits": 0.3253076821565628, "step": 1328 }, { "epoch": 0.083125, "grad_norm": 3.234375, "grad_norm_var": 2.086107381184896, "learning_rate": 0.0001, "loss": 8.8107, "loss/crossentropy": 1.9973264932632446, "loss/hidden": 3.5625, "loss/jsd": 0.0, "loss/logits": 0.2617499902844429, "step": 1330 }, { "epoch": 0.08325, "grad_norm": 3.46875, "grad_norm_var": 2.1166666666666667, "learning_rate": 0.0001, "loss": 8.8316, "loss/crossentropy": 2.300974130630493, "loss/hidden": 3.6484375, "loss/jsd": 0.0, "loss/logits": 0.3343648910522461, "step": 1332 }, { "epoch": 0.083375, "grad_norm": 3.640625, "grad_norm_var": 2.0879058837890625, "learning_rate": 0.0001, "loss": 9.1785, "loss/crossentropy": 2.216305136680603, "loss/hidden": 3.765625, "loss/jsd": 0.0, "loss/logits": 0.33670032024383545, "step": 1334 }, { "epoch": 0.0835, "grad_norm": 3.671875, "grad_norm_var": 0.03583577473958333, "learning_rate": 0.0001, "loss": 9.0382, "loss/crossentropy": 2.33861768245697, "loss/hidden": 3.65625, "loss/jsd": 0.0, "loss/logits": 0.3006826788187027, "step": 1336 }, { "epoch": 0.083625, "grad_norm": 3.25, "grad_norm_var": 0.046484375, "learning_rate": 0.0001, "loss": 8.8885, "loss/crossentropy": 2.2827532291412354, "loss/hidden": 3.5859375, "loss/jsd": 0.0, "loss/logits": 0.30343687534332275, "step": 1338 }, { "epoch": 0.08375, "grad_norm": 3.78125, "grad_norm_var": 0.0532867431640625, "learning_rate": 0.0001, "loss": 9.278, "loss/crossentropy": 2.195745587348938, "loss/hidden": 3.6875, "loss/jsd": 0.0, "loss/logits": 0.30087001621723175, "step": 1340 }, { "epoch": 0.083875, "grad_norm": 3.8125, "grad_norm_var": 0.044140625, "learning_rate": 0.0001, "loss": 8.9545, "loss/crossentropy": 2.388075351715088, "loss/hidden": 3.6328125, "loss/jsd": 0.0, "loss/logits": 0.31265978515148163, "step": 1342 }, { "epoch": 0.084, "grad_norm": 3.484375, "grad_norm_var": 0.04077860514322917, "learning_rate": 0.0001, "loss": 8.8301, "loss/crossentropy": 2.193402647972107, "loss/hidden": 3.640625, "loss/jsd": 0.0, "loss/logits": 0.28910548985004425, "step": 1344 }, { "epoch": 0.084125, "grad_norm": 3.578125, "grad_norm_var": 0.03902587890625, "learning_rate": 0.0001, "loss": 8.69, "loss/crossentropy": 2.1855721473693848, "loss/hidden": 3.6640625, "loss/jsd": 0.0, "loss/logits": 0.30405496060848236, "step": 1346 }, { "epoch": 0.08425, "grad_norm": 3.6875, "grad_norm_var": 0.03798828125, "learning_rate": 0.0001, "loss": 8.9597, "loss/crossentropy": 1.9088768362998962, "loss/hidden": 3.578125, "loss/jsd": 0.0, "loss/logits": 0.30821050703525543, "step": 1348 }, { "epoch": 0.084375, "grad_norm": 3.421875, "grad_norm_var": 0.03740234375, "learning_rate": 0.0001, "loss": 9.0073, "loss/crossentropy": 2.6817314624786377, "loss/hidden": 3.6875, "loss/jsd": 0.0, "loss/logits": 0.34145255386829376, "step": 1350 }, { "epoch": 0.0845, "grad_norm": 4.40625, "grad_norm_var": 0.08152669270833333, "learning_rate": 0.0001, "loss": 9.3486, "loss/crossentropy": 2.1007471084594727, "loss/hidden": 3.6484375, "loss/jsd": 0.0, "loss/logits": 0.37222176790237427, "step": 1352 }, { "epoch": 0.084625, "grad_norm": 3.953125, "grad_norm_var": 0.8203603108723958, "learning_rate": 0.0001, "loss": 9.182, "loss/crossentropy": 2.3299310207366943, "loss/hidden": 3.6796875, "loss/jsd": 0.0, "loss/logits": 0.34407567977905273, "step": 1354 }, { "epoch": 0.08475, "grad_norm": 4.15625, "grad_norm_var": 1.1480794270833334, "learning_rate": 0.0001, "loss": 9.0186, "loss/crossentropy": 2.0842106342315674, "loss/hidden": 3.640625, "loss/jsd": 0.0, "loss/logits": 0.29379770159721375, "step": 1356 }, { "epoch": 0.084875, "grad_norm": 4.40625, "grad_norm_var": 1.145654296875, "learning_rate": 0.0001, "loss": 9.1571, "loss/crossentropy": 2.092414438724518, "loss/hidden": 3.7578125, "loss/jsd": 0.0, "loss/logits": 0.30193740129470825, "step": 1358 }, { "epoch": 0.085, "grad_norm": 3.859375, "grad_norm_var": 1.1072550455729167, "learning_rate": 0.0001, "loss": 8.8558, "loss/crossentropy": 2.198947310447693, "loss/hidden": 3.578125, "loss/jsd": 0.0, "loss/logits": 0.3119354546070099, "step": 1360 }, { "epoch": 0.085125, "grad_norm": 4.6875, "grad_norm_var": 1.04185791015625, "learning_rate": 0.0001, "loss": 9.2801, "loss/crossentropy": 2.1080846190452576, "loss/hidden": 3.859375, "loss/jsd": 0.0, "loss/logits": 0.30948643386363983, "step": 1362 }, { "epoch": 0.08525, "grad_norm": 3.59375, "grad_norm_var": 1.0319488525390625, "learning_rate": 0.0001, "loss": 9.364, "loss/crossentropy": 2.3470795154571533, "loss/hidden": 3.609375, "loss/jsd": 0.0, "loss/logits": 0.31810642778873444, "step": 1364 }, { "epoch": 0.085375, "grad_norm": 4.15625, "grad_norm_var": 0.9828603108723958, "learning_rate": 0.0001, "loss": 9.1917, "loss/crossentropy": 2.348735809326172, "loss/hidden": 3.671875, "loss/jsd": 0.0, "loss/logits": 0.3358312100172043, "step": 1366 }, { "epoch": 0.0855, "grad_norm": 3.890625, "grad_norm_var": 0.962841796875, "learning_rate": 0.0001, "loss": 9.1159, "loss/crossentropy": 2.24025958776474, "loss/hidden": 3.5234375, "loss/jsd": 0.0, "loss/logits": 0.29087674617767334, "step": 1368 }, { "epoch": 0.085625, "grad_norm": 3.5625, "grad_norm_var": 0.40837300618489586, "learning_rate": 0.0001, "loss": 8.9369, "loss/crossentropy": 1.9485474228858948, "loss/hidden": 3.53125, "loss/jsd": 0.0, "loss/logits": 0.28918443620204926, "step": 1370 }, { "epoch": 0.08575, "grad_norm": 3.640625, "grad_norm_var": 0.09461263020833334, "learning_rate": 0.0001, "loss": 8.9396, "loss/crossentropy": 2.1872143745422363, "loss/hidden": 3.625, "loss/jsd": 0.0, "loss/logits": 0.2917555719614029, "step": 1372 }, { "epoch": 0.085875, "grad_norm": 3.71875, "grad_norm_var": 0.09455973307291667, "learning_rate": 0.0001, "loss": 8.7361, "loss/crossentropy": 2.0125487446784973, "loss/hidden": 3.5625, "loss/jsd": 0.0, "loss/logits": 0.301084503531456, "step": 1374 }, { "epoch": 0.086, "grad_norm": 3.734375, "grad_norm_var": 0.09461263020833334, "learning_rate": 0.0001, "loss": 9.0682, "loss/crossentropy": 2.587073802947998, "loss/hidden": 3.65625, "loss/jsd": 0.0, "loss/logits": 0.31948888301849365, "step": 1376 }, { "epoch": 0.086125, "grad_norm": 3.828125, "grad_norm_var": 0.04976806640625, "learning_rate": 0.0001, "loss": 8.9381, "loss/crossentropy": 2.0685659646987915, "loss/hidden": 3.6328125, "loss/jsd": 0.0, "loss/logits": 0.3181370496749878, "step": 1378 }, { "epoch": 0.08625, "grad_norm": 3.671875, "grad_norm_var": 0.051904296875, "learning_rate": 0.0001, "loss": 9.0739, "loss/crossentropy": 2.113464593887329, "loss/hidden": 3.609375, "loss/jsd": 0.0, "loss/logits": 0.2755056321620941, "step": 1380 }, { "epoch": 0.086375, "grad_norm": 3.78125, "grad_norm_var": 0.03990478515625, "learning_rate": 0.0001, "loss": 9.0354, "loss/crossentropy": 2.6173815727233887, "loss/hidden": 3.734375, "loss/jsd": 0.0, "loss/logits": 0.31043821573257446, "step": 1382 }, { "epoch": 0.0865, "grad_norm": 5.0, "grad_norm_var": 0.1398345947265625, "learning_rate": 0.0001, "loss": 9.2396, "loss/crossentropy": 2.3400750160217285, "loss/hidden": 3.625, "loss/jsd": 0.0, "loss/logits": 0.3131750673055649, "step": 1384 }, { "epoch": 0.086625, "grad_norm": 3.875, "grad_norm_var": 0.33166402180989585, "learning_rate": 0.0001, "loss": 9.2615, "loss/crossentropy": 2.1931885480880737, "loss/hidden": 3.6875, "loss/jsd": 0.0, "loss/logits": 0.36186739802360535, "step": 1386 }, { "epoch": 0.08675, "grad_norm": 3.765625, "grad_norm_var": 0.33310546875, "learning_rate": 0.0001, "loss": 8.9166, "loss/crossentropy": 2.1065213680267334, "loss/hidden": 3.6484375, "loss/jsd": 0.0, "loss/logits": 0.32110610604286194, "step": 1388 }, { "epoch": 0.086875, "grad_norm": 8.75, "grad_norm_var": 1.762401326497396, "learning_rate": 0.0001, "loss": 9.6829, "loss/crossentropy": 2.510193705558777, "loss/hidden": 3.796875, "loss/jsd": 0.0, "loss/logits": 0.5324367135763168, "step": 1390 }, { "epoch": 0.087, "grad_norm": 3.8125, "grad_norm_var": 1.7495269775390625, "learning_rate": 0.0001, "loss": 8.7412, "loss/crossentropy": 2.079869508743286, "loss/hidden": 3.7734375, "loss/jsd": 0.0, "loss/logits": 0.31986746191978455, "step": 1392 }, { "epoch": 0.087125, "grad_norm": 3.828125, "grad_norm_var": 1.7681803385416666, "learning_rate": 0.0001, "loss": 8.9185, "loss/crossentropy": 2.380985975265503, "loss/hidden": 3.796875, "loss/jsd": 0.0, "loss/logits": 0.3713766932487488, "step": 1394 }, { "epoch": 0.08725, "grad_norm": 3.546875, "grad_norm_var": 1.7469472249348958, "learning_rate": 0.0001, "loss": 8.7602, "loss/crossentropy": 1.9419381022453308, "loss/hidden": 3.5390625, "loss/jsd": 0.0, "loss/logits": 0.255826435983181, "step": 1396 }, { "epoch": 0.087375, "grad_norm": 3.734375, "grad_norm_var": 1.74287109375, "learning_rate": 0.0001, "loss": 9.0714, "loss/crossentropy": 2.2587140798568726, "loss/hidden": 3.6484375, "loss/jsd": 0.0, "loss/logits": 0.3152017891407013, "step": 1398 }, { "epoch": 0.0875, "grad_norm": 3.734375, "grad_norm_var": 1.7314615885416667, "learning_rate": 0.0001, "loss": 9.0489, "loss/crossentropy": 2.3675363063812256, "loss/hidden": 3.625, "loss/jsd": 0.0, "loss/logits": 0.2992345988750458, "step": 1400 }, { "epoch": 0.087625, "grad_norm": 3.53125, "grad_norm_var": 1.6247548421223958, "learning_rate": 0.0001, "loss": 8.8616, "loss/crossentropy": 2.149065613746643, "loss/hidden": 3.59375, "loss/jsd": 0.0, "loss/logits": 0.29075902700424194, "step": 1402 }, { "epoch": 0.08775, "grad_norm": 4.125, "grad_norm_var": 1.6389556884765626, "learning_rate": 0.0001, "loss": 9.0005, "loss/crossentropy": 2.608632802963257, "loss/hidden": 3.65625, "loss/jsd": 0.0, "loss/logits": 0.301875039935112, "step": 1404 }, { "epoch": 0.087875, "grad_norm": 3.984375, "grad_norm_var": 0.07696940104166666, "learning_rate": 0.0001, "loss": 9.2496, "loss/crossentropy": 2.3347132205963135, "loss/hidden": 3.734375, "loss/jsd": 0.0, "loss/logits": 0.38300836086273193, "step": 1406 }, { "epoch": 0.088, "grad_norm": 3.734375, "grad_norm_var": 0.04844462076822917, "learning_rate": 0.0001, "loss": 8.9987, "loss/crossentropy": 2.233608603477478, "loss/hidden": 3.6015625, "loss/jsd": 0.0, "loss/logits": 0.3263901472091675, "step": 1408 }, { "epoch": 0.088125, "grad_norm": 4.40625, "grad_norm_var": 0.06700846354166666, "learning_rate": 0.0001, "loss": 9.1953, "loss/crossentropy": 2.1250086426734924, "loss/hidden": 3.609375, "loss/jsd": 0.0, "loss/logits": 0.29557597637176514, "step": 1410 }, { "epoch": 0.08825, "grad_norm": 3.390625, "grad_norm_var": 0.10357666015625, "learning_rate": 0.0001, "loss": 8.8501, "loss/crossentropy": 2.1611350178718567, "loss/hidden": 3.7890625, "loss/jsd": 0.0, "loss/logits": 0.3238576501607895, "step": 1412 }, { "epoch": 0.088375, "grad_norm": 3.328125, "grad_norm_var": 0.11812744140625, "learning_rate": 0.0001, "loss": 8.9842, "loss/crossentropy": 2.3532297611236572, "loss/hidden": 3.6171875, "loss/jsd": 0.0, "loss/logits": 0.2911015599966049, "step": 1414 }, { "epoch": 0.0885, "grad_norm": 3.765625, "grad_norm_var": 0.11819254557291667, "learning_rate": 0.0001, "loss": 8.7732, "loss/crossentropy": 2.068236827850342, "loss/hidden": 3.609375, "loss/jsd": 0.0, "loss/logits": 0.3031652122735977, "step": 1416 }, { "epoch": 0.088625, "grad_norm": 3.578125, "grad_norm_var": 0.11331278483072917, "learning_rate": 0.0001, "loss": 9.1532, "loss/crossentropy": 2.1716268062591553, "loss/hidden": 3.5546875, "loss/jsd": 0.0, "loss/logits": 0.31497564911842346, "step": 1418 }, { "epoch": 0.08875, "grad_norm": 4.09375, "grad_norm_var": 0.100634765625, "learning_rate": 0.0001, "loss": 9.0399, "loss/crossentropy": 2.0323649048805237, "loss/hidden": 3.609375, "loss/jsd": 0.0, "loss/logits": 0.27277450263500214, "step": 1420 }, { "epoch": 0.088875, "grad_norm": 4.03125, "grad_norm_var": 0.098681640625, "learning_rate": 0.0001, "loss": 8.9242, "loss/crossentropy": 2.1079633831977844, "loss/hidden": 3.703125, "loss/jsd": 0.0, "loss/logits": 0.30429257452487946, "step": 1422 }, { "epoch": 0.089, "grad_norm": 4.0, "grad_norm_var": 0.10099995930989583, "learning_rate": 0.0001, "loss": 9.2486, "loss/crossentropy": 2.4154094457626343, "loss/hidden": 3.6953125, "loss/jsd": 0.0, "loss/logits": 0.36461447179317474, "step": 1424 }, { "epoch": 0.089125, "grad_norm": 3.296875, "grad_norm_var": 0.09290262858072916, "learning_rate": 0.0001, "loss": 8.8973, "loss/crossentropy": 2.3857645988464355, "loss/hidden": 3.6328125, "loss/jsd": 0.0, "loss/logits": 0.30349257588386536, "step": 1426 }, { "epoch": 0.08925, "grad_norm": 11.5625, "grad_norm_var": 3.882080078125, "learning_rate": 0.0001, "loss": 9.3002, "loss/crossentropy": 2.381242036819458, "loss/hidden": 3.796875, "loss/jsd": 0.0, "loss/logits": 0.44322696328163147, "step": 1428 }, { "epoch": 0.089375, "grad_norm": 4.1875, "grad_norm_var": 3.7999501546223957, "learning_rate": 0.0001, "loss": 9.0099, "loss/crossentropy": 2.2321606874465942, "loss/hidden": 3.703125, "loss/jsd": 0.0, "loss/logits": 0.3364466726779938, "step": 1430 }, { "epoch": 0.0895, "grad_norm": 3.421875, "grad_norm_var": 3.827408854166667, "learning_rate": 0.0001, "loss": 9.1032, "loss/crossentropy": 2.343966007232666, "loss/hidden": 3.6640625, "loss/jsd": 0.0, "loss/logits": 0.33518894016742706, "step": 1432 }, { "epoch": 0.089625, "grad_norm": 3.375, "grad_norm_var": 3.845894368489583, "learning_rate": 0.0001, "loss": 9.0993, "loss/crossentropy": 2.393994450569153, "loss/hidden": 3.625, "loss/jsd": 0.0, "loss/logits": 0.29674217104911804, "step": 1434 }, { "epoch": 0.08975, "grad_norm": 3.421875, "grad_norm_var": 3.918683878580729, "learning_rate": 0.0001, "loss": 8.7823, "loss/crossentropy": 2.2474565505981445, "loss/hidden": 3.59375, "loss/jsd": 0.0, "loss/logits": 0.27851349115371704, "step": 1436 }, { "epoch": 0.089875, "grad_norm": 3.4375, "grad_norm_var": 3.988167317708333, "learning_rate": 0.0001, "loss": 8.6073, "loss/crossentropy": 2.0842167139053345, "loss/hidden": 3.5546875, "loss/jsd": 0.0, "loss/logits": 0.2909235507249832, "step": 1438 }, { "epoch": 0.09, "grad_norm": 3.609375, "grad_norm_var": 4.016567993164062, "learning_rate": 0.0001, "loss": 9.0951, "loss/crossentropy": 2.3772757053375244, "loss/hidden": 3.65625, "loss/jsd": 0.0, "loss/logits": 0.31073370575904846, "step": 1440 }, { "epoch": 0.090125, "grad_norm": 4.0625, "grad_norm_var": 3.939159138997396, "learning_rate": 0.0001, "loss": 8.9393, "loss/crossentropy": 2.2321159839630127, "loss/hidden": 3.5546875, "loss/jsd": 0.0, "loss/logits": 0.3076823353767395, "step": 1442 }, { "epoch": 0.09025, "grad_norm": 3.375, "grad_norm_var": 0.11077473958333334, "learning_rate": 0.0001, "loss": 8.9251, "loss/crossentropy": 2.1912073493003845, "loss/hidden": 3.5859375, "loss/jsd": 0.0, "loss/logits": 0.2842486649751663, "step": 1444 }, { "epoch": 0.090375, "grad_norm": 3.28125, "grad_norm_var": 0.05536702473958333, "learning_rate": 0.0001, "loss": 8.876, "loss/crossentropy": 2.3551260232925415, "loss/hidden": 3.6328125, "loss/jsd": 0.0, "loss/logits": 0.30490826070308685, "step": 1446 }, { "epoch": 0.0905, "grad_norm": 3.453125, "grad_norm_var": 0.053141276041666664, "learning_rate": 0.0001, "loss": 9.1948, "loss/crossentropy": 2.4481849670410156, "loss/hidden": 3.5546875, "loss/jsd": 0.0, "loss/logits": 0.31339772045612335, "step": 1448 }, { "epoch": 0.090625, "grad_norm": 3.34375, "grad_norm_var": 0.0527496337890625, "learning_rate": 0.0001, "loss": 8.9187, "loss/crossentropy": 2.4239214658737183, "loss/hidden": 3.6171875, "loss/jsd": 0.0, "loss/logits": 0.28998003900051117, "step": 1450 }, { "epoch": 0.09075, "grad_norm": 3.875, "grad_norm_var": 0.1454986572265625, "learning_rate": 0.0001, "loss": 9.072, "loss/crossentropy": 2.325495958328247, "loss/hidden": 3.8359375, "loss/jsd": 0.0, "loss/logits": 0.318745493888855, "step": 1452 }, { "epoch": 0.090875, "grad_norm": 3.59375, "grad_norm_var": 0.13629557291666666, "learning_rate": 0.0001, "loss": 9.1969, "loss/crossentropy": 2.2769020795822144, "loss/hidden": 3.625, "loss/jsd": 0.0, "loss/logits": 0.35522060096263885, "step": 1454 }, { "epoch": 0.091, "grad_norm": 3.296875, "grad_norm_var": 0.14644775390625, "learning_rate": 0.0001, "loss": 9.0135, "loss/crossentropy": 1.9657680988311768, "loss/hidden": 3.5703125, "loss/jsd": 0.0, "loss/logits": 0.2873254120349884, "step": 1456 }, { "epoch": 0.091125, "grad_norm": 3.84375, "grad_norm_var": 0.14329020182291666, "learning_rate": 0.0001, "loss": 9.3595, "loss/crossentropy": 2.0547165870666504, "loss/hidden": 3.5390625, "loss/jsd": 0.0, "loss/logits": 0.31567882001399994, "step": 1458 }, { "epoch": 0.09125, "grad_norm": 3.328125, "grad_norm_var": 0.14524637858072917, "learning_rate": 0.0001, "loss": 8.6653, "loss/crossentropy": 2.1225805282592773, "loss/hidden": 3.546875, "loss/jsd": 0.0, "loss/logits": 0.29530632495880127, "step": 1460 }, { "epoch": 0.091375, "grad_norm": 15.75, "grad_norm_var": 9.23115946451823, "learning_rate": 0.0001, "loss": 9.0634, "loss/crossentropy": 2.110310137271881, "loss/hidden": 3.671875, "loss/jsd": 0.0, "loss/logits": 0.3338662087917328, "step": 1462 }, { "epoch": 0.0915, "grad_norm": 3.625, "grad_norm_var": 9.184105428059896, "learning_rate": 0.0001, "loss": 9.2073, "loss/crossentropy": 2.3429659605026245, "loss/hidden": 3.734375, "loss/jsd": 0.0, "loss/logits": 0.3294520825147629, "step": 1464 }, { "epoch": 0.091625, "grad_norm": 4.34375, "grad_norm_var": 9.143276977539063, "learning_rate": 0.0001, "loss": 8.8513, "loss/crossentropy": 2.3899245262145996, "loss/hidden": 3.65625, "loss/jsd": 0.0, "loss/logits": 0.2869112491607666, "step": 1466 }, { "epoch": 0.09175, "grad_norm": 3.6875, "grad_norm_var": 9.187165323893229, "learning_rate": 0.0001, "loss": 9.042, "loss/crossentropy": 2.2129557132720947, "loss/hidden": 3.5859375, "loss/jsd": 0.0, "loss/logits": 0.2893831729888916, "step": 1468 }, { "epoch": 0.091875, "grad_norm": 3.953125, "grad_norm_var": 9.14636942545573, "learning_rate": 0.0001, "loss": 9.2403, "loss/crossentropy": 2.323067307472229, "loss/hidden": 3.59375, "loss/jsd": 0.0, "loss/logits": 0.3108373284339905, "step": 1470 }, { "epoch": 0.092, "grad_norm": 3.375, "grad_norm_var": 9.182405598958333, "learning_rate": 0.0001, "loss": 8.8441, "loss/crossentropy": 2.4471434354782104, "loss/hidden": 3.59375, "loss/jsd": 0.0, "loss/logits": 0.30828359723091125, "step": 1472 }, { "epoch": 0.092125, "grad_norm": 3.4375, "grad_norm_var": 9.286442057291667, "learning_rate": 0.0001, "loss": 8.7897, "loss/crossentropy": 2.284360885620117, "loss/hidden": 3.7421875, "loss/jsd": 0.0, "loss/logits": 0.28711090981960297, "step": 1474 }, { "epoch": 0.09225, "grad_norm": 3.25, "grad_norm_var": 9.285087076822917, "learning_rate": 0.0001, "loss": 8.6543, "loss/crossentropy": 2.1655644178390503, "loss/hidden": 3.5234375, "loss/jsd": 0.0, "loss/logits": 0.2804734408855438, "step": 1476 }, { "epoch": 0.092375, "grad_norm": 3.40625, "grad_norm_var": 0.09638671875, "learning_rate": 0.0001, "loss": 8.9708, "loss/crossentropy": 2.5127278566360474, "loss/hidden": 3.5859375, "loss/jsd": 0.0, "loss/logits": 0.29755330085754395, "step": 1478 }, { "epoch": 0.0925, "grad_norm": 3.453125, "grad_norm_var": 0.08045247395833334, "learning_rate": 0.0001, "loss": 8.916, "loss/crossentropy": 2.3931394815444946, "loss/hidden": 3.6875, "loss/jsd": 0.0, "loss/logits": 0.302903413772583, "step": 1480 }, { "epoch": 0.092625, "grad_norm": 3.8125, "grad_norm_var": 0.04517822265625, "learning_rate": 0.0001, "loss": 9.4703, "loss/crossentropy": 2.5292797088623047, "loss/hidden": 3.6328125, "loss/jsd": 0.0, "loss/logits": 0.3174290806055069, "step": 1482 }, { "epoch": 0.09275, "grad_norm": 3.640625, "grad_norm_var": 0.041552734375, "learning_rate": 0.0001, "loss": 9.1167, "loss/crossentropy": 2.4334981441497803, "loss/hidden": 3.5234375, "loss/jsd": 0.0, "loss/logits": 0.29388993978500366, "step": 1484 }, { "epoch": 0.092875, "grad_norm": 3.65625, "grad_norm_var": 0.04499409993489583, "learning_rate": 0.0001, "loss": 9.0964, "loss/crossentropy": 2.441982388496399, "loss/hidden": 3.6328125, "loss/jsd": 0.0, "loss/logits": 0.3226486146450043, "step": 1486 }, { "epoch": 0.093, "grad_norm": 3.5625, "grad_norm_var": 0.04394124348958333, "learning_rate": 0.0001, "loss": 8.9111, "loss/crossentropy": 2.227149486541748, "loss/hidden": 3.59375, "loss/jsd": 0.0, "loss/logits": 0.31223711371421814, "step": 1488 }, { "epoch": 0.093125, "grad_norm": 3.5, "grad_norm_var": 0.04195556640625, "learning_rate": 0.0001, "loss": 8.7153, "loss/crossentropy": 1.9171633124351501, "loss/hidden": 3.6015625, "loss/jsd": 0.0, "loss/logits": 0.2628061920404434, "step": 1490 }, { "epoch": 0.09325, "grad_norm": 4.1875, "grad_norm_var": 0.05587565104166667, "learning_rate": 0.0001, "loss": 9.083, "loss/crossentropy": 2.4501841068267822, "loss/hidden": 3.546875, "loss/jsd": 0.0, "loss/logits": 0.30108408629894257, "step": 1492 }, { "epoch": 0.093375, "grad_norm": 3.859375, "grad_norm_var": 0.054011027018229164, "learning_rate": 0.0001, "loss": 8.6752, "loss/crossentropy": 2.2503867149353027, "loss/hidden": 3.59375, "loss/jsd": 0.0, "loss/logits": 0.3120990991592407, "step": 1494 }, { "epoch": 0.0935, "grad_norm": 3.609375, "grad_norm_var": 0.050126139322916666, "learning_rate": 0.0001, "loss": 8.9822, "loss/crossentropy": 2.3671040534973145, "loss/hidden": 3.6484375, "loss/jsd": 0.0, "loss/logits": 0.3191666901111603, "step": 1496 }, { "epoch": 0.093625, "grad_norm": 3.359375, "grad_norm_var": 0.07618815104166667, "learning_rate": 0.0001, "loss": 8.6951, "loss/crossentropy": 2.075209140777588, "loss/hidden": 3.546875, "loss/jsd": 0.0, "loss/logits": 0.2776128873229027, "step": 1498 }, { "epoch": 0.09375, "grad_norm": 3.484375, "grad_norm_var": 0.09049072265625, "learning_rate": 0.0001, "loss": 8.7726, "loss/crossentropy": 2.350838303565979, "loss/hidden": 3.65625, "loss/jsd": 0.0, "loss/logits": 0.29100096225738525, "step": 1500 }, { "epoch": 0.093875, "grad_norm": 3.5625, "grad_norm_var": 0.0909576416015625, "learning_rate": 0.0001, "loss": 9.0944, "loss/crossentropy": 2.2875425815582275, "loss/hidden": 3.5078125, "loss/jsd": 0.0, "loss/logits": 0.284750759601593, "step": 1502 }, { "epoch": 0.094, "grad_norm": 3.96875, "grad_norm_var": 0.09234110514322917, "learning_rate": 0.0001, "loss": 8.8064, "loss/crossentropy": 2.155337333679199, "loss/hidden": 3.5625, "loss/jsd": 0.0, "loss/logits": 0.2878541350364685, "step": 1504 }, { "epoch": 0.094125, "grad_norm": 3.9375, "grad_norm_var": 0.0933990478515625, "learning_rate": 0.0001, "loss": 9.0625, "loss/crossentropy": 2.1204712986946106, "loss/hidden": 3.6640625, "loss/jsd": 0.0, "loss/logits": 0.3001447916030884, "step": 1506 }, { "epoch": 0.09425, "grad_norm": 3.46875, "grad_norm_var": 0.08415425618489583, "learning_rate": 0.0001, "loss": 8.9478, "loss/crossentropy": 2.261082649230957, "loss/hidden": 3.546875, "loss/jsd": 0.0, "loss/logits": 0.30300605297088623, "step": 1508 }, { "epoch": 0.094375, "grad_norm": 3.921875, "grad_norm_var": 0.09697163899739583, "learning_rate": 0.0001, "loss": 8.5129, "loss/crossentropy": 1.9053804278373718, "loss/hidden": 3.75, "loss/jsd": 0.0, "loss/logits": 0.28153228759765625, "step": 1510 }, { "epoch": 0.0945, "grad_norm": 3.5625, "grad_norm_var": 0.1041656494140625, "learning_rate": 0.0001, "loss": 8.9368, "loss/crossentropy": 2.2206510305404663, "loss/hidden": 3.609375, "loss/jsd": 0.0, "loss/logits": 0.2887195497751236, "step": 1512 }, { "epoch": 0.094625, "grad_norm": 3.703125, "grad_norm_var": 0.06965738932291667, "learning_rate": 0.0001, "loss": 9.0568, "loss/crossentropy": 2.3785996437072754, "loss/hidden": 3.6328125, "loss/jsd": 0.0, "loss/logits": 0.31852445006370544, "step": 1514 }, { "epoch": 0.09475, "grad_norm": 3.546875, "grad_norm_var": 0.07568257649739583, "learning_rate": 0.0001, "loss": 8.9885, "loss/crossentropy": 2.36277174949646, "loss/hidden": 3.65625, "loss/jsd": 0.0, "loss/logits": 0.30836641788482666, "step": 1516 }, { "epoch": 0.094875, "grad_norm": 3.421875, "grad_norm_var": 0.07068684895833334, "learning_rate": 0.0001, "loss": 9.2226, "loss/crossentropy": 2.3763426542282104, "loss/hidden": 3.6171875, "loss/jsd": 0.0, "loss/logits": 0.29590751230716705, "step": 1518 }, { "epoch": 0.095, "grad_norm": 3.625, "grad_norm_var": 0.06292215983072917, "learning_rate": 0.0001, "loss": 9.0313, "loss/crossentropy": 2.1982592344284058, "loss/hidden": 3.5546875, "loss/jsd": 0.0, "loss/logits": 0.30034808814525604, "step": 1520 }, { "epoch": 0.095125, "grad_norm": 4.03125, "grad_norm_var": 0.0716796875, "learning_rate": 0.0001, "loss": 8.9631, "loss/crossentropy": 2.4760122299194336, "loss/hidden": 3.5390625, "loss/jsd": 0.0, "loss/logits": 0.2868836522102356, "step": 1522 }, { "epoch": 0.09525, "grad_norm": 4.0625, "grad_norm_var": 0.07799072265625, "learning_rate": 0.0001, "loss": 8.9311, "loss/crossentropy": 2.0417147874832153, "loss/hidden": 3.5703125, "loss/jsd": 0.0, "loss/logits": 0.29642339050769806, "step": 1524 }, { "epoch": 0.095375, "grad_norm": 3.578125, "grad_norm_var": 0.05696512858072917, "learning_rate": 0.0001, "loss": 8.7245, "loss/crossentropy": 2.2432550191879272, "loss/hidden": 3.609375, "loss/jsd": 0.0, "loss/logits": 0.3071749359369278, "step": 1526 }, { "epoch": 0.0955, "grad_norm": 3.515625, "grad_norm_var": 0.05774332682291667, "learning_rate": 0.0001, "loss": 9.0378, "loss/crossentropy": 2.345797300338745, "loss/hidden": 3.671875, "loss/jsd": 0.0, "loss/logits": 0.3259291499853134, "step": 1528 }, { "epoch": 0.095625, "grad_norm": 3.515625, "grad_norm_var": 0.05821024576822917, "learning_rate": 0.0001, "loss": 8.8138, "loss/crossentropy": 2.2577234506607056, "loss/hidden": 3.6328125, "loss/jsd": 0.0, "loss/logits": 0.3370388150215149, "step": 1530 }, { "epoch": 0.09575, "grad_norm": 4.40625, "grad_norm_var": 0.08303120930989584, "learning_rate": 0.0001, "loss": 8.8964, "loss/crossentropy": 1.981217086315155, "loss/hidden": 3.625, "loss/jsd": 0.0, "loss/logits": 0.3404854089021683, "step": 1532 }, { "epoch": 0.095875, "grad_norm": 3.59375, "grad_norm_var": 0.0755859375, "learning_rate": 0.0001, "loss": 8.5834, "loss/crossentropy": 2.1789305210113525, "loss/hidden": 3.484375, "loss/jsd": 0.0, "loss/logits": 0.30090533196926117, "step": 1534 }, { "epoch": 0.096, "grad_norm": 3.546875, "grad_norm_var": 0.09191792805989583, "learning_rate": 0.0001, "loss": 8.8597, "loss/crossentropy": 1.992014229297638, "loss/hidden": 3.671875, "loss/jsd": 0.0, "loss/logits": 0.3228597491979599, "step": 1536 }, { "epoch": 0.096125, "grad_norm": 3.6875, "grad_norm_var": 0.36575520833333336, "learning_rate": 0.0001, "loss": 9.0713, "loss/crossentropy": 2.329966425895691, "loss/hidden": 3.6015625, "loss/jsd": 0.0, "loss/logits": 0.3234909027814865, "step": 1538 }, { "epoch": 0.09625, "grad_norm": 3.90625, "grad_norm_var": 0.3859334309895833, "learning_rate": 0.0001, "loss": 9.0135, "loss/crossentropy": 2.41361665725708, "loss/hidden": 3.6328125, "loss/jsd": 0.0, "loss/logits": 0.2853923887014389, "step": 1540 }, { "epoch": 0.096375, "grad_norm": 3.140625, "grad_norm_var": 0.4335245768229167, "learning_rate": 0.0001, "loss": 8.4879, "loss/crossentropy": 2.288613796234131, "loss/hidden": 3.671875, "loss/jsd": 0.0, "loss/logits": 0.2597767412662506, "step": 1542 }, { "epoch": 0.0965, "grad_norm": 6.59375, "grad_norm_var": 0.87587890625, "learning_rate": 0.0001, "loss": 9.2222, "loss/crossentropy": 2.175418972969055, "loss/hidden": 3.59375, "loss/jsd": 0.0, "loss/logits": 0.3049424886703491, "step": 1544 }, { "epoch": 0.096625, "grad_norm": 3.5, "grad_norm_var": 0.8698883056640625, "learning_rate": 0.0001, "loss": 8.9722, "loss/crossentropy": 2.263410449028015, "loss/hidden": 3.6015625, "loss/jsd": 0.0, "loss/logits": 0.31041011214256287, "step": 1546 }, { "epoch": 0.09675, "grad_norm": 4.3125, "grad_norm_var": 1.2735514322916666, "learning_rate": 0.0001, "loss": 9.241, "loss/crossentropy": 2.399603247642517, "loss/hidden": 3.703125, "loss/jsd": 0.0, "loss/logits": 0.31990472972393036, "step": 1548 }, { "epoch": 0.096875, "grad_norm": 3.546875, "grad_norm_var": 1.321484375, "learning_rate": 0.0001, "loss": 9.138, "loss/crossentropy": 2.046228289604187, "loss/hidden": 3.859375, "loss/jsd": 0.0, "loss/logits": 0.30006906390190125, "step": 1550 }, { "epoch": 0.097, "grad_norm": 3.5, "grad_norm_var": 1.3254791259765626, "learning_rate": 0.0001, "loss": 8.9087, "loss/crossentropy": 2.1539169549942017, "loss/hidden": 3.578125, "loss/jsd": 0.0, "loss/logits": 0.3151702433824539, "step": 1552 }, { "epoch": 0.097125, "grad_norm": 3.6875, "grad_norm_var": 1.212970987955729, "learning_rate": 0.0001, "loss": 8.9283, "loss/crossentropy": 2.50217342376709, "loss/hidden": 3.6015625, "loss/jsd": 0.0, "loss/logits": 0.28748680651187897, "step": 1554 }, { "epoch": 0.09725, "grad_norm": 3.546875, "grad_norm_var": 1.2947336832682292, "learning_rate": 0.0001, "loss": 8.6031, "loss/crossentropy": 2.1512579917907715, "loss/hidden": 3.578125, "loss/jsd": 0.0, "loss/logits": 0.2983407974243164, "step": 1556 }, { "epoch": 0.097375, "grad_norm": 4.09375, "grad_norm_var": 1.1539296468098958, "learning_rate": 0.0001, "loss": 8.9816, "loss/crossentropy": 2.3176417350769043, "loss/hidden": 3.828125, "loss/jsd": 0.0, "loss/logits": 0.3164384812116623, "step": 1558 }, { "epoch": 0.0975, "grad_norm": 4.34375, "grad_norm_var": 0.7965983072916667, "learning_rate": 0.0001, "loss": 9.3022, "loss/crossentropy": 2.202598214149475, "loss/hidden": 3.5859375, "loss/jsd": 0.0, "loss/logits": 0.3014833629131317, "step": 1560 }, { "epoch": 0.097625, "grad_norm": 3.53125, "grad_norm_var": 0.7975250244140625, "learning_rate": 0.0001, "loss": 8.9839, "loss/crossentropy": 2.6111624240875244, "loss/hidden": 3.640625, "loss/jsd": 0.0, "loss/logits": 0.3125310689210892, "step": 1562 }, { "epoch": 0.09775, "grad_norm": 3.53125, "grad_norm_var": 0.31539306640625, "learning_rate": 0.0001, "loss": 8.9879, "loss/crossentropy": 2.4788397550582886, "loss/hidden": 3.484375, "loss/jsd": 0.0, "loss/logits": 0.2863108217716217, "step": 1564 }, { "epoch": 0.097875, "grad_norm": 3.359375, "grad_norm_var": 0.15663960774739583, "learning_rate": 0.0001, "loss": 8.7361, "loss/crossentropy": 2.510140299797058, "loss/hidden": 3.5703125, "loss/jsd": 0.0, "loss/logits": 0.29954764246940613, "step": 1566 }, { "epoch": 0.098, "grad_norm": 3.71875, "grad_norm_var": 0.10305074055989584, "learning_rate": 0.0001, "loss": 9.0333, "loss/crossentropy": 2.620720624923706, "loss/hidden": 3.5390625, "loss/jsd": 0.0, "loss/logits": 0.33839190006256104, "step": 1568 }, { "epoch": 0.098125, "grad_norm": 3.78125, "grad_norm_var": 0.11689046223958334, "learning_rate": 0.0001, "loss": 8.8276, "loss/crossentropy": 2.252309799194336, "loss/hidden": 3.6015625, "loss/jsd": 0.0, "loss/logits": 0.27855412662029266, "step": 1570 }, { "epoch": 0.09825, "grad_norm": 3.75, "grad_norm_var": 0.10963134765625, "learning_rate": 0.0001, "loss": 8.6684, "loss/crossentropy": 2.149215817451477, "loss/hidden": 3.5, "loss/jsd": 0.0, "loss/logits": 0.3184356689453125, "step": 1572 }, { "epoch": 0.098375, "grad_norm": 3.703125, "grad_norm_var": 0.11996968587239583, "learning_rate": 0.0001, "loss": 8.9074, "loss/crossentropy": 2.234324097633362, "loss/hidden": 3.609375, "loss/jsd": 0.0, "loss/logits": 0.29477037489414215, "step": 1574 }, { "epoch": 0.0985, "grad_norm": 3.515625, "grad_norm_var": 0.08814697265625, "learning_rate": 0.0001, "loss": 9.1442, "loss/crossentropy": 2.3028059005737305, "loss/hidden": 3.6171875, "loss/jsd": 0.0, "loss/logits": 0.2830100804567337, "step": 1576 }, { "epoch": 0.098625, "grad_norm": 3.75, "grad_norm_var": 0.0841217041015625, "learning_rate": 0.0001, "loss": 8.8454, "loss/crossentropy": 2.4806586503982544, "loss/hidden": 3.609375, "loss/jsd": 0.0, "loss/logits": 0.3144974857568741, "step": 1578 }, { "epoch": 0.09875, "grad_norm": 3.453125, "grad_norm_var": 0.08088785807291667, "learning_rate": 0.0001, "loss": 8.8154, "loss/crossentropy": 2.412961959838867, "loss/hidden": 3.5625, "loss/jsd": 0.0, "loss/logits": 0.3073071539402008, "step": 1580 }, { "epoch": 0.098875, "grad_norm": 3.421875, "grad_norm_var": 0.1001861572265625, "learning_rate": 0.0001, "loss": 8.56, "loss/crossentropy": 2.061438202857971, "loss/hidden": 3.515625, "loss/jsd": 0.0, "loss/logits": 0.27945011854171753, "step": 1582 }, { "epoch": 0.099, "grad_norm": 3.90625, "grad_norm_var": 0.10096028645833334, "learning_rate": 0.0001, "loss": 9.0916, "loss/crossentropy": 2.4244210720062256, "loss/hidden": 3.671875, "loss/jsd": 0.0, "loss/logits": 0.31236982345581055, "step": 1584 }, { "epoch": 0.099125, "grad_norm": 3.5, "grad_norm_var": 0.10036519368489584, "learning_rate": 0.0001, "loss": 8.6853, "loss/crossentropy": 2.1662213802337646, "loss/hidden": 3.53125, "loss/jsd": 0.0, "loss/logits": 0.2974780946969986, "step": 1586 }, { "epoch": 0.09925, "grad_norm": 3.828125, "grad_norm_var": 0.11070556640625, "learning_rate": 0.0001, "loss": 8.7188, "loss/crossentropy": 2.209104061126709, "loss/hidden": 3.4375, "loss/jsd": 0.0, "loss/logits": 0.2640179917216301, "step": 1588 }, { "epoch": 0.099375, "grad_norm": 3.96875, "grad_norm_var": 0.06985677083333333, "learning_rate": 0.0001, "loss": 8.9161, "loss/crossentropy": 2.2211254835128784, "loss/hidden": 3.53125, "loss/jsd": 0.0, "loss/logits": 0.29335029423236847, "step": 1590 }, { "epoch": 0.0995, "grad_norm": 3.28125, "grad_norm_var": 0.08837788899739583, "learning_rate": 0.0001, "loss": 8.5428, "loss/crossentropy": 2.3369797468185425, "loss/hidden": 3.4375, "loss/jsd": 0.0, "loss/logits": 0.27689458429813385, "step": 1592 }, { "epoch": 0.099625, "grad_norm": 3.671875, "grad_norm_var": 0.08738505045572917, "learning_rate": 0.0001, "loss": 8.7455, "loss/crossentropy": 2.475472331047058, "loss/hidden": 3.46875, "loss/jsd": 0.0, "loss/logits": 0.283773735165596, "step": 1594 }, { "epoch": 0.09975, "grad_norm": 3.53125, "grad_norm_var": 0.08677469889322917, "learning_rate": 0.0001, "loss": 8.7417, "loss/crossentropy": 2.2405002117156982, "loss/hidden": 3.6171875, "loss/jsd": 0.0, "loss/logits": 0.29159049689769745, "step": 1596 }, { "epoch": 0.099875, "grad_norm": 3.5, "grad_norm_var": 0.054352823893229166, "learning_rate": 0.0001, "loss": 8.9507, "loss/crossentropy": 2.119444251060486, "loss/hidden": 3.59375, "loss/jsd": 0.0, "loss/logits": 0.2701597958803177, "step": 1598 }, { "epoch": 0.1, "grad_norm": 3.34375, "grad_norm_var": 0.04915262858072917, "learning_rate": 0.0001, "loss": 8.9078, "loss/crossentropy": 1.9827104210853577, "loss/hidden": 3.625, "loss/jsd": 0.0, "loss/logits": 0.271870881319046, "step": 1600 }, { "epoch": 0.100125, "grad_norm": 4.1875, "grad_norm_var": 0.07768452962239583, "learning_rate": 0.0001, "loss": 8.445, "loss/crossentropy": 1.849865972995758, "loss/hidden": 3.453125, "loss/jsd": 0.0, "loss/logits": 0.2567581683397293, "step": 1602 }, { "epoch": 0.10025, "grad_norm": 4.0, "grad_norm_var": 0.07940165201822917, "learning_rate": 0.0001, "loss": 8.974, "loss/crossentropy": 2.542338013648987, "loss/hidden": 3.609375, "loss/jsd": 0.0, "loss/logits": 0.3163818120956421, "step": 1604 }, { "epoch": 0.100375, "grad_norm": 3.390625, "grad_norm_var": 0.07452799479166666, "learning_rate": 0.0001, "loss": 8.7601, "loss/crossentropy": 2.4402318000793457, "loss/hidden": 3.5546875, "loss/jsd": 0.0, "loss/logits": 0.32151587307453156, "step": 1606 }, { "epoch": 0.1005, "grad_norm": 3.484375, "grad_norm_var": 0.0613677978515625, "learning_rate": 0.0001, "loss": 8.7432, "loss/crossentropy": 2.2898634672164917, "loss/hidden": 3.6328125, "loss/jsd": 0.0, "loss/logits": 0.2893410176038742, "step": 1608 }, { "epoch": 0.100625, "grad_norm": 3.484375, "grad_norm_var": 0.06580403645833334, "learning_rate": 0.0001, "loss": 9.1135, "loss/crossentropy": 1.9438655376434326, "loss/hidden": 3.6484375, "loss/jsd": 0.0, "loss/logits": 0.30355459451675415, "step": 1610 }, { "epoch": 0.10075, "grad_norm": 3.296875, "grad_norm_var": 0.06982014973958334, "learning_rate": 0.0001, "loss": 8.5993, "loss/crossentropy": 2.1151537895202637, "loss/hidden": 3.5234375, "loss/jsd": 0.0, "loss/logits": 0.2977646142244339, "step": 1612 }, { "epoch": 0.100875, "grad_norm": 4.15625, "grad_norm_var": 6.615803019205729, "learning_rate": 0.0001, "loss": 9.1827, "loss/crossentropy": 2.446377754211426, "loss/hidden": 3.6171875, "loss/jsd": 0.0, "loss/logits": 0.377622589468956, "step": 1614 }, { "epoch": 0.101, "grad_norm": 3.234375, "grad_norm_var": 6.618659464518229, "learning_rate": 0.0001, "loss": 9.0182, "loss/crossentropy": 2.0556036829948425, "loss/hidden": 3.484375, "loss/jsd": 0.0, "loss/logits": 0.28740599751472473, "step": 1616 }, { "epoch": 0.101125, "grad_norm": 3.4375, "grad_norm_var": 6.631354777018229, "learning_rate": 0.0001, "loss": 8.9975, "loss/crossentropy": 2.304566264152527, "loss/hidden": 3.625, "loss/jsd": 0.0, "loss/logits": 0.2804791033267975, "step": 1618 }, { "epoch": 0.10125, "grad_norm": 3.46875, "grad_norm_var": 6.6969146728515625, "learning_rate": 0.0001, "loss": 8.694, "loss/crossentropy": 2.2216708660125732, "loss/hidden": 3.53125, "loss/jsd": 0.0, "loss/logits": 0.2794913947582245, "step": 1620 }, { "epoch": 0.101375, "grad_norm": 3.890625, "grad_norm_var": 8.288785807291667, "learning_rate": 0.0001, "loss": 9.264, "loss/crossentropy": 2.435174822807312, "loss/hidden": 3.765625, "loss/jsd": 0.0, "loss/logits": 0.4018533527851105, "step": 1622 }, { "epoch": 0.1015, "grad_norm": 3.375, "grad_norm_var": 8.238833618164062, "learning_rate": 0.0001, "loss": 8.501, "loss/crossentropy": 1.887346863746643, "loss/hidden": 3.515625, "loss/jsd": 0.0, "loss/logits": 0.2690751999616623, "step": 1624 }, { "epoch": 0.101625, "grad_norm": 4.03125, "grad_norm_var": 8.101903279622396, "learning_rate": 0.0001, "loss": 8.9455, "loss/crossentropy": 2.3377827405929565, "loss/hidden": 3.4453125, "loss/jsd": 0.0, "loss/logits": 0.29517775774002075, "step": 1626 }, { "epoch": 0.10175, "grad_norm": 4.0625, "grad_norm_var": 8.028579711914062, "learning_rate": 0.0001, "loss": 8.8202, "loss/crossentropy": 2.1615554094314575, "loss/hidden": 3.6640625, "loss/jsd": 0.0, "loss/logits": 0.3005082905292511, "step": 1628 }, { "epoch": 0.101875, "grad_norm": 3.671875, "grad_norm_var": 2.107893880208333, "learning_rate": 0.0001, "loss": 8.9428, "loss/crossentropy": 2.3544455766677856, "loss/hidden": 3.625, "loss/jsd": 0.0, "loss/logits": 0.3055412173271179, "step": 1630 }, { "epoch": 0.102, "grad_norm": 3.234375, "grad_norm_var": 2.1177968343098956, "learning_rate": 0.0001, "loss": 8.6612, "loss/crossentropy": 2.412680745124817, "loss/hidden": 3.578125, "loss/jsd": 0.0, "loss/logits": 0.28455087542533875, "step": 1632 }, { "epoch": 0.102125, "grad_norm": 3.359375, "grad_norm_var": 2.1278717041015627, "learning_rate": 0.0001, "loss": 8.6029, "loss/crossentropy": 2.032555937767029, "loss/hidden": 3.515625, "loss/jsd": 0.0, "loss/logits": 0.2699122503399849, "step": 1634 }, { "epoch": 0.10225, "grad_norm": 3.5, "grad_norm_var": 2.1118072509765624, "learning_rate": 0.0001, "loss": 8.7099, "loss/crossentropy": 2.4264371395111084, "loss/hidden": 3.703125, "loss/jsd": 0.0, "loss/logits": 0.30828195810317993, "step": 1636 }, { "epoch": 0.102375, "grad_norm": 3.296875, "grad_norm_var": 0.23749898274739584, "learning_rate": 0.0001, "loss": 8.596, "loss/crossentropy": 2.133467674255371, "loss/hidden": 3.484375, "loss/jsd": 0.0, "loss/logits": 0.2638777419924736, "step": 1638 }, { "epoch": 0.1025, "grad_norm": 3.421875, "grad_norm_var": 0.25396728515625, "learning_rate": 0.0001, "loss": 8.8445, "loss/crossentropy": 2.190924048423767, "loss/hidden": 3.484375, "loss/jsd": 0.0, "loss/logits": 0.26482895016670227, "step": 1640 }, { "epoch": 0.102625, "grad_norm": 3.46875, "grad_norm_var": 0.2435943603515625, "learning_rate": 0.0001, "loss": 8.7939, "loss/crossentropy": 2.3963547945022583, "loss/hidden": 3.65625, "loss/jsd": 0.0, "loss/logits": 0.31311626732349396, "step": 1642 }, { "epoch": 0.10275, "grad_norm": 3.4375, "grad_norm_var": 0.27591044108072915, "learning_rate": 0.0001, "loss": 8.495, "loss/crossentropy": 2.1265352964401245, "loss/hidden": 3.5, "loss/jsd": 0.0, "loss/logits": 0.26110444962978363, "step": 1644 }, { "epoch": 0.102875, "grad_norm": 5.9375, "grad_norm_var": 0.5888509114583333, "learning_rate": 0.0001, "loss": 9.5281, "loss/crossentropy": 2.4642388820648193, "loss/hidden": 3.9375, "loss/jsd": 0.0, "loss/logits": 0.48999957740306854, "step": 1646 }, { "epoch": 0.103, "grad_norm": 4.15625, "grad_norm_var": 0.5702626546223958, "learning_rate": 0.0001, "loss": 8.9299, "loss/crossentropy": 2.327626943588257, "loss/hidden": 3.609375, "loss/jsd": 0.0, "loss/logits": 0.2904190868139267, "step": 1648 }, { "epoch": 0.103125, "grad_norm": 3.59375, "grad_norm_var": 0.5589914957682292, "learning_rate": 0.0001, "loss": 8.8149, "loss/crossentropy": 2.153541386127472, "loss/hidden": 3.5546875, "loss/jsd": 0.0, "loss/logits": 0.2824144512414932, "step": 1650 }, { "epoch": 0.10325, "grad_norm": 3.484375, "grad_norm_var": 0.5656412760416667, "learning_rate": 0.0001, "loss": 8.5516, "loss/crossentropy": 2.1480560898780823, "loss/hidden": 3.5546875, "loss/jsd": 0.0, "loss/logits": 0.2773754894733429, "step": 1652 }, { "epoch": 0.103375, "grad_norm": 3.390625, "grad_norm_var": 0.45019124348958334, "learning_rate": 0.0001, "loss": 8.4516, "loss/crossentropy": 2.243725061416626, "loss/hidden": 3.5703125, "loss/jsd": 0.0, "loss/logits": 0.2610960602760315, "step": 1654 }, { "epoch": 0.1035, "grad_norm": 3.890625, "grad_norm_var": 0.44556884765625, "learning_rate": 0.0001, "loss": 8.5117, "loss/crossentropy": 2.2539373636245728, "loss/hidden": 3.5078125, "loss/jsd": 0.0, "loss/logits": 0.2900232970714569, "step": 1656 }, { "epoch": 0.103625, "grad_norm": 3.5, "grad_norm_var": 0.4426503499348958, "learning_rate": 0.0001, "loss": 8.8094, "loss/crossentropy": 2.20787513256073, "loss/hidden": 3.6015625, "loss/jsd": 0.0, "loss/logits": 0.281345397233963, "step": 1658 }, { "epoch": 0.10375, "grad_norm": 3.6875, "grad_norm_var": 0.40562235514322914, "learning_rate": 0.0001, "loss": 8.5381, "loss/crossentropy": 2.4937009811401367, "loss/hidden": 3.578125, "loss/jsd": 0.0, "loss/logits": 0.29782480001449585, "step": 1660 }, { "epoch": 0.103875, "grad_norm": 3.34375, "grad_norm_var": 0.060205078125, "learning_rate": 0.0001, "loss": 8.5589, "loss/crossentropy": 1.8507967591285706, "loss/hidden": 3.546875, "loss/jsd": 0.0, "loss/logits": 0.2902481257915497, "step": 1662 }, { "epoch": 0.104, "grad_norm": 3.609375, "grad_norm_var": 0.03362223307291667, "learning_rate": 0.0001, "loss": 8.6672, "loss/crossentropy": 2.3311851024627686, "loss/hidden": 3.59375, "loss/jsd": 0.0, "loss/logits": 0.31640036404132843, "step": 1664 }, { "epoch": 0.104125, "grad_norm": 3.265625, "grad_norm_var": 0.03610026041666667, "learning_rate": 0.0001, "loss": 8.6649, "loss/crossentropy": 2.20018208026886, "loss/hidden": 3.640625, "loss/jsd": 0.0, "loss/logits": 0.26927848160266876, "step": 1666 }, { "epoch": 0.10425, "grad_norm": 3.3125, "grad_norm_var": 0.04209696451822917, "learning_rate": 0.0001, "loss": 8.6148, "loss/crossentropy": 2.3382433652877808, "loss/hidden": 3.4921875, "loss/jsd": 0.0, "loss/logits": 0.28644439578056335, "step": 1668 }, { "epoch": 0.104375, "grad_norm": 3.671875, "grad_norm_var": 0.045563761393229166, "learning_rate": 0.0001, "loss": 8.502, "loss/crossentropy": 2.082834839820862, "loss/hidden": 3.5078125, "loss/jsd": 0.0, "loss/logits": 0.2668560594320297, "step": 1670 }, { "epoch": 0.1045, "grad_norm": 3.296875, "grad_norm_var": 0.026691691080729166, "learning_rate": 0.0001, "loss": 8.6683, "loss/crossentropy": 2.0344194769859314, "loss/hidden": 3.5390625, "loss/jsd": 0.0, "loss/logits": 0.2505979910492897, "step": 1672 }, { "epoch": 0.104625, "grad_norm": 3.328125, "grad_norm_var": 0.027176920572916666, "learning_rate": 0.0001, "loss": 8.7809, "loss/crossentropy": 2.3554714918136597, "loss/hidden": 3.5859375, "loss/jsd": 0.0, "loss/logits": 0.29616880416870117, "step": 1674 }, { "epoch": 0.10475, "grad_norm": 3.875, "grad_norm_var": 0.03427734375, "learning_rate": 0.0001, "loss": 8.883, "loss/crossentropy": 2.415210723876953, "loss/hidden": 3.5859375, "loss/jsd": 0.0, "loss/logits": 0.29819734394550323, "step": 1676 }, { "epoch": 0.104875, "grad_norm": 3.25, "grad_norm_var": 0.0369537353515625, "learning_rate": 0.0001, "loss": 8.6134, "loss/crossentropy": 2.2332391142845154, "loss/hidden": 3.5625, "loss/jsd": 0.0, "loss/logits": 0.27132728695869446, "step": 1678 }, { "epoch": 0.105, "grad_norm": 3.359375, "grad_norm_var": 0.03421223958333333, "learning_rate": 0.0001, "loss": 8.5279, "loss/crossentropy": 2.0381821393966675, "loss/hidden": 3.4765625, "loss/jsd": 0.0, "loss/logits": 0.2683364152908325, "step": 1680 }, { "epoch": 0.105125, "grad_norm": 3.609375, "grad_norm_var": 0.16770731608072917, "learning_rate": 0.0001, "loss": 8.6381, "loss/crossentropy": 2.08440899848938, "loss/hidden": 3.9140625, "loss/jsd": 0.0, "loss/logits": 0.3021021783351898, "step": 1682 }, { "epoch": 0.10525, "grad_norm": 3.921875, "grad_norm_var": 0.18061421712239584, "learning_rate": 0.0001, "loss": 8.936, "loss/crossentropy": 2.247341275215149, "loss/hidden": 3.78125, "loss/jsd": 0.0, "loss/logits": 0.33302658796310425, "step": 1684 }, { "epoch": 0.105375, "grad_norm": 3.6875, "grad_norm_var": 0.18000895182291668, "learning_rate": 0.0001, "loss": 8.7025, "loss/crossentropy": 2.487499713897705, "loss/hidden": 3.46875, "loss/jsd": 0.0, "loss/logits": 0.2774240970611572, "step": 1686 }, { "epoch": 0.1055, "grad_norm": 3.65625, "grad_norm_var": 0.1778717041015625, "learning_rate": 0.0001, "loss": 8.7795, "loss/crossentropy": 2.072964370250702, "loss/hidden": 3.4375, "loss/jsd": 0.0, "loss/logits": 0.32281462848186493, "step": 1688 }, { "epoch": 0.105625, "grad_norm": 3.484375, "grad_norm_var": 0.17327067057291667, "learning_rate": 0.0001, "loss": 8.4966, "loss/crossentropy": 1.858900010585785, "loss/hidden": 3.640625, "loss/jsd": 0.0, "loss/logits": 0.27107909321784973, "step": 1690 }, { "epoch": 0.10575, "grad_norm": 3.15625, "grad_norm_var": 0.17893473307291666, "learning_rate": 0.0001, "loss": 8.5945, "loss/crossentropy": 2.253313660621643, "loss/hidden": 3.4765625, "loss/jsd": 0.0, "loss/logits": 0.277107298374176, "step": 1692 }, { "epoch": 0.105875, "grad_norm": 3.828125, "grad_norm_var": 0.17164306640625, "learning_rate": 0.0001, "loss": 8.6327, "loss/crossentropy": 1.9608340859413147, "loss/hidden": 3.6015625, "loss/jsd": 0.0, "loss/logits": 0.28436803817749023, "step": 1694 }, { "epoch": 0.106, "grad_norm": 3.84375, "grad_norm_var": 0.16134440104166667, "learning_rate": 0.0001, "loss": 8.6966, "loss/crossentropy": 1.9271037578582764, "loss/hidden": 3.5703125, "loss/jsd": 0.0, "loss/logits": 0.26714108884334564, "step": 1696 }, { "epoch": 0.106125, "grad_norm": 6.0, "grad_norm_var": 0.4263834635416667, "learning_rate": 0.0001, "loss": 9.1171, "loss/crossentropy": 2.2318227291107178, "loss/hidden": 3.625, "loss/jsd": 0.0, "loss/logits": 0.3112039566040039, "step": 1698 }, { "epoch": 0.10625, "grad_norm": 3.703125, "grad_norm_var": 0.4244466145833333, "learning_rate": 0.0001, "loss": 8.7063, "loss/crossentropy": 2.0639569759368896, "loss/hidden": 3.546875, "loss/jsd": 0.0, "loss/logits": 0.2992531955242157, "step": 1700 }, { "epoch": 0.106375, "grad_norm": 16.5, "grad_norm_var": 10.641402180989584, "learning_rate": 0.0001, "loss": 9.2668, "loss/crossentropy": 2.3329110145568848, "loss/hidden": 3.921875, "loss/jsd": 0.0, "loss/logits": 0.4682595729827881, "step": 1702 }, { "epoch": 0.1065, "grad_norm": 4.03125, "grad_norm_var": 10.505052693684895, "learning_rate": 0.0001, "loss": 8.7568, "loss/crossentropy": 2.453068733215332, "loss/hidden": 3.6015625, "loss/jsd": 0.0, "loss/logits": 0.2915901094675064, "step": 1704 }, { "epoch": 0.106625, "grad_norm": 3.5, "grad_norm_var": 10.39441630045573, "learning_rate": 0.0001, "loss": 8.7048, "loss/crossentropy": 2.0224732756614685, "loss/hidden": 3.578125, "loss/jsd": 0.0, "loss/logits": 0.29426078498363495, "step": 1706 }, { "epoch": 0.10675, "grad_norm": 3.328125, "grad_norm_var": 10.381029256184895, "learning_rate": 0.0001, "loss": 8.4496, "loss/crossentropy": 2.1453932523727417, "loss/hidden": 3.5859375, "loss/jsd": 0.0, "loss/logits": 0.26775461435317993, "step": 1708 }, { "epoch": 0.106875, "grad_norm": 3.6875, "grad_norm_var": 10.417154947916666, "learning_rate": 0.0001, "loss": 8.9018, "loss/crossentropy": 2.601366400718689, "loss/hidden": 3.609375, "loss/jsd": 0.0, "loss/logits": 0.3308381289243698, "step": 1710 }, { "epoch": 0.107, "grad_norm": 3.296875, "grad_norm_var": 10.510835774739583, "learning_rate": 0.0001, "loss": 8.4869, "loss/crossentropy": 2.089142084121704, "loss/hidden": 3.5859375, "loss/jsd": 0.0, "loss/logits": 0.282400518655777, "step": 1712 }, { "epoch": 0.107125, "grad_norm": 3.328125, "grad_norm_var": 10.490428670247395, "learning_rate": 0.0001, "loss": 8.4865, "loss/crossentropy": 1.9690085053443909, "loss/hidden": 3.5390625, "loss/jsd": 0.0, "loss/logits": 0.28465139865875244, "step": 1714 }, { "epoch": 0.10725, "grad_norm": 3.234375, "grad_norm_var": 10.531078084309895, "learning_rate": 0.0001, "loss": 8.7701, "loss/crossentropy": 2.1551159620285034, "loss/hidden": 3.5390625, "loss/jsd": 0.0, "loss/logits": 0.261458195745945, "step": 1716 }, { "epoch": 0.107375, "grad_norm": 3.6875, "grad_norm_var": 0.12132059733072917, "learning_rate": 0.0001, "loss": 8.8407, "loss/crossentropy": 2.17106294631958, "loss/hidden": 3.484375, "loss/jsd": 0.0, "loss/logits": 0.29966770112514496, "step": 1718 }, { "epoch": 0.1075, "grad_norm": 3.4375, "grad_norm_var": 0.05081380208333333, "learning_rate": 0.0001, "loss": 8.8806, "loss/crossentropy": 2.559879422187805, "loss/hidden": 3.6640625, "loss/jsd": 0.0, "loss/logits": 0.34499090909957886, "step": 1720 }, { "epoch": 0.107625, "grad_norm": 3.296875, "grad_norm_var": 0.07704671223958333, "learning_rate": 0.0001, "loss": 8.8094, "loss/crossentropy": 2.2468985319137573, "loss/hidden": 3.59375, "loss/jsd": 0.0, "loss/logits": 0.31510044634342194, "step": 1722 }, { "epoch": 0.10775, "grad_norm": 3.21875, "grad_norm_var": 0.07730712890625, "learning_rate": 0.0001, "loss": 8.6919, "loss/crossentropy": 2.2554560899734497, "loss/hidden": 3.5390625, "loss/jsd": 0.0, "loss/logits": 0.2784760594367981, "step": 1724 }, { "epoch": 0.107875, "grad_norm": 14.6875, "grad_norm_var": 7.846825154622396, "learning_rate": 0.0001, "loss": 9.0366, "loss/crossentropy": 2.1934194564819336, "loss/hidden": 3.640625, "loss/jsd": 0.0, "loss/logits": 0.31857216358184814, "step": 1726 }, { "epoch": 0.108, "grad_norm": 4.3125, "grad_norm_var": 7.835798136393229, "learning_rate": 0.0001, "loss": 8.8942, "loss/crossentropy": 2.2181382179260254, "loss/hidden": 3.578125, "loss/jsd": 0.0, "loss/logits": 0.28492943942546844, "step": 1728 }, { "epoch": 0.108125, "grad_norm": 3.28125, "grad_norm_var": 7.860835774739583, "learning_rate": 0.0001, "loss": 8.631, "loss/crossentropy": 2.178107976913452, "loss/hidden": 3.6640625, "loss/jsd": 0.0, "loss/logits": 0.2998335361480713, "step": 1730 }, { "epoch": 0.10825, "grad_norm": 4.0625, "grad_norm_var": 7.78082275390625, "learning_rate": 0.0001, "loss": 9.1469, "loss/crossentropy": 2.3030436038970947, "loss/hidden": 3.4921875, "loss/jsd": 0.0, "loss/logits": 0.30075204372406006, "step": 1732 }, { "epoch": 0.108375, "grad_norm": 4.125, "grad_norm_var": 7.778645833333333, "learning_rate": 0.0001, "loss": 9.0554, "loss/crossentropy": 2.4445960521698, "loss/hidden": 3.5390625, "loss/jsd": 0.0, "loss/logits": 0.3125688135623932, "step": 1734 }, { "epoch": 0.1085, "grad_norm": 3.25, "grad_norm_var": 7.803348795572917, "learning_rate": 0.0001, "loss": 8.8836, "loss/crossentropy": 2.1892318725585938, "loss/hidden": 3.4609375, "loss/jsd": 0.0, "loss/logits": 0.27461107075214386, "step": 1736 }, { "epoch": 0.108625, "grad_norm": 3.421875, "grad_norm_var": 7.936099243164063, "learning_rate": 0.0001, "loss": 8.5065, "loss/crossentropy": 2.2882145643234253, "loss/hidden": 3.515625, "loss/jsd": 0.0, "loss/logits": 0.28314919769763947, "step": 1738 }, { "epoch": 0.10875, "grad_norm": 3.359375, "grad_norm_var": 7.901162719726562, "learning_rate": 0.0001, "loss": 8.8148, "loss/crossentropy": 2.4665156602859497, "loss/hidden": 3.5078125, "loss/jsd": 0.0, "loss/logits": 0.298910915851593, "step": 1740 }, { "epoch": 0.108875, "grad_norm": 3.296875, "grad_norm_var": 0.4255035400390625, "learning_rate": 0.0001, "loss": 8.7002, "loss/crossentropy": 2.3757593631744385, "loss/hidden": 3.640625, "loss/jsd": 0.0, "loss/logits": 0.26680710911750793, "step": 1742 }, { "epoch": 0.109, "grad_norm": 4.0625, "grad_norm_var": 0.11559244791666666, "learning_rate": 0.0001, "loss": 8.8274, "loss/crossentropy": 2.191728115081787, "loss/hidden": 3.5234375, "loss/jsd": 0.0, "loss/logits": 0.311319962143898, "step": 1744 }, { "epoch": 0.109125, "grad_norm": 3.59375, "grad_norm_var": 0.10793863932291667, "learning_rate": 0.0001, "loss": 9.0444, "loss/crossentropy": 2.593402147293091, "loss/hidden": 3.625, "loss/jsd": 0.0, "loss/logits": 0.29716330766677856, "step": 1746 }, { "epoch": 0.10925, "grad_norm": 3.3125, "grad_norm_var": 0.09600321451822917, "learning_rate": 0.0001, "loss": 8.379, "loss/crossentropy": 2.2119646072387695, "loss/hidden": 3.4921875, "loss/jsd": 0.0, "loss/logits": 0.3135145306587219, "step": 1748 }, { "epoch": 0.109375, "grad_norm": 6.09375, "grad_norm_var": 0.5061808268229167, "learning_rate": 0.0001, "loss": 8.9239, "loss/crossentropy": 2.3165996074676514, "loss/hidden": 3.546875, "loss/jsd": 0.0, "loss/logits": 0.3138580918312073, "step": 1750 }, { "epoch": 0.1095, "grad_norm": 3.703125, "grad_norm_var": 0.4987131754557292, "learning_rate": 0.0001, "loss": 8.5823, "loss/crossentropy": 2.1701733469963074, "loss/hidden": 3.59375, "loss/jsd": 0.0, "loss/logits": 0.29495371878147125, "step": 1752 }, { "epoch": 0.109625, "grad_norm": 3.71875, "grad_norm_var": 0.4587473551432292, "learning_rate": 0.0001, "loss": 8.8967, "loss/crossentropy": 2.3956644535064697, "loss/hidden": 3.5078125, "loss/jsd": 0.0, "loss/logits": 0.29799969494342804, "step": 1754 }, { "epoch": 0.10975, "grad_norm": 3.296875, "grad_norm_var": 0.4662506103515625, "learning_rate": 0.0001, "loss": 8.695, "loss/crossentropy": 2.2648149132728577, "loss/hidden": 3.515625, "loss/jsd": 0.0, "loss/logits": 0.3074349910020828, "step": 1756 }, { "epoch": 0.109875, "grad_norm": 3.515625, "grad_norm_var": 0.455517578125, "learning_rate": 0.0001, "loss": 8.7087, "loss/crossentropy": 2.300851821899414, "loss/hidden": 3.5390625, "loss/jsd": 0.0, "loss/logits": 0.3047354221343994, "step": 1758 }, { "epoch": 0.11, "grad_norm": 3.234375, "grad_norm_var": 0.45696614583333334, "learning_rate": 0.0001, "loss": 8.605, "loss/crossentropy": 2.0940895080566406, "loss/hidden": 3.5703125, "loss/jsd": 0.0, "loss/logits": 0.25672924518585205, "step": 1760 }, { "epoch": 0.110125, "grad_norm": 3.578125, "grad_norm_var": 0.45799153645833335, "learning_rate": 0.0001, "loss": 8.7684, "loss/crossentropy": 2.095521330833435, "loss/hidden": 3.53125, "loss/jsd": 0.0, "loss/logits": 0.29457685351371765, "step": 1762 }, { "epoch": 0.11025, "grad_norm": 3.765625, "grad_norm_var": 0.43875325520833336, "learning_rate": 0.0001, "loss": 9.1227, "loss/crossentropy": 2.3666045665740967, "loss/hidden": 3.4921875, "loss/jsd": 0.0, "loss/logits": 0.2777389585971832, "step": 1764 }, { "epoch": 0.110375, "grad_norm": 3.515625, "grad_norm_var": 0.033812459309895834, "learning_rate": 0.0001, "loss": 8.6011, "loss/crossentropy": 2.305943012237549, "loss/hidden": 3.4921875, "loss/jsd": 0.0, "loss/logits": 0.293687641620636, "step": 1766 }, { "epoch": 0.1105, "grad_norm": 4.3125, "grad_norm_var": 0.06910400390625, "learning_rate": 0.0001, "loss": 8.773, "loss/crossentropy": 2.26338267326355, "loss/hidden": 3.5390625, "loss/jsd": 0.0, "loss/logits": 0.30263902246952057, "step": 1768 }, { "epoch": 0.110625, "grad_norm": 3.625, "grad_norm_var": 0.29260660807291666, "learning_rate": 0.0001, "loss": 8.6503, "loss/crossentropy": 2.1858354806900024, "loss/hidden": 3.5078125, "loss/jsd": 0.0, "loss/logits": 0.3575718253850937, "step": 1770 }, { "epoch": 0.11075, "grad_norm": 3.65625, "grad_norm_var": 0.27864481608072916, "learning_rate": 0.0001, "loss": 8.8443, "loss/crossentropy": 2.3129689693450928, "loss/hidden": 3.515625, "loss/jsd": 0.0, "loss/logits": 0.27823805809020996, "step": 1772 }, { "epoch": 0.110875, "grad_norm": 3.3125, "grad_norm_var": 0.27697652180989585, "learning_rate": 0.0001, "loss": 8.5938, "loss/crossentropy": 2.5259759426116943, "loss/hidden": 3.6328125, "loss/jsd": 0.0, "loss/logits": 0.33172979950904846, "step": 1774 }, { "epoch": 0.111, "grad_norm": 3.359375, "grad_norm_var": 0.26672261555989585, "learning_rate": 0.0001, "loss": 8.879, "loss/crossentropy": 2.4290276765823364, "loss/hidden": 3.515625, "loss/jsd": 0.0, "loss/logits": 0.3110189586877823, "step": 1776 }, { "epoch": 0.111125, "grad_norm": 3.515625, "grad_norm_var": 0.27392171223958334, "learning_rate": 0.0001, "loss": 8.7915, "loss/crossentropy": 2.3960351943969727, "loss/hidden": 3.578125, "loss/jsd": 0.0, "loss/logits": 0.30158601701259613, "step": 1778 }, { "epoch": 0.11125, "grad_norm": 3.109375, "grad_norm_var": 0.3003163655598958, "learning_rate": 0.0001, "loss": 8.5199, "loss/crossentropy": 2.082271099090576, "loss/hidden": 3.453125, "loss/jsd": 0.0, "loss/logits": 0.27362723648548126, "step": 1780 }, { "epoch": 0.111375, "grad_norm": 4.0, "grad_norm_var": 0.30058186848958335, "learning_rate": 0.0001, "loss": 8.9533, "loss/crossentropy": 2.1544870138168335, "loss/hidden": 3.5234375, "loss/jsd": 0.0, "loss/logits": 0.3307036906480789, "step": 1782 }, { "epoch": 0.1115, "grad_norm": 3.34375, "grad_norm_var": 0.2777984619140625, "learning_rate": 0.0001, "loss": 8.7576, "loss/crossentropy": 2.44951331615448, "loss/hidden": 3.5859375, "loss/jsd": 0.0, "loss/logits": 0.309871181845665, "step": 1784 }, { "epoch": 0.111625, "grad_norm": 3.5, "grad_norm_var": 0.045182291666666666, "learning_rate": 0.0001, "loss": 8.6303, "loss/crossentropy": 2.1309146881103516, "loss/hidden": 3.4609375, "loss/jsd": 0.0, "loss/logits": 0.2824729233980179, "step": 1786 }, { "epoch": 0.11175, "grad_norm": 3.5, "grad_norm_var": 0.04435933430989583, "learning_rate": 0.0001, "loss": 8.5938, "loss/crossentropy": 2.233055591583252, "loss/hidden": 3.5546875, "loss/jsd": 0.0, "loss/logits": 0.30681729316711426, "step": 1788 }, { "epoch": 0.111875, "grad_norm": 3.46875, "grad_norm_var": 0.04644775390625, "learning_rate": 0.0001, "loss": 8.7359, "loss/crossentropy": 2.0256340503692627, "loss/hidden": 3.5234375, "loss/jsd": 0.0, "loss/logits": 0.2636297792196274, "step": 1790 }, { "epoch": 0.112, "grad_norm": 3.296875, "grad_norm_var": 0.04838765462239583, "learning_rate": 0.0001, "loss": 8.9568, "loss/crossentropy": 2.048343300819397, "loss/hidden": 3.625, "loss/jsd": 0.0, "loss/logits": 0.4137808680534363, "step": 1792 }, { "epoch": 0.112125, "grad_norm": 4.1875, "grad_norm_var": 0.08330078125, "learning_rate": 0.0001, "loss": 8.8021, "loss/crossentropy": 2.2039687633514404, "loss/hidden": 3.5078125, "loss/jsd": 0.0, "loss/logits": 0.3387700319290161, "step": 1794 }, { "epoch": 0.11225, "grad_norm": 4.125, "grad_norm_var": 0.0920562744140625, "learning_rate": 0.0001, "loss": 8.6473, "loss/crossentropy": 2.3665820360183716, "loss/hidden": 3.53125, "loss/jsd": 0.0, "loss/logits": 0.2901700288057327, "step": 1796 }, { "epoch": 0.112375, "grad_norm": 3.765625, "grad_norm_var": 0.08447265625, "learning_rate": 0.0001, "loss": 8.6925, "loss/crossentropy": 2.00313001871109, "loss/hidden": 3.40625, "loss/jsd": 0.0, "loss/logits": 0.28618834912776947, "step": 1798 }, { "epoch": 0.1125, "grad_norm": 3.46875, "grad_norm_var": 0.07996419270833334, "learning_rate": 0.0001, "loss": 8.6164, "loss/crossentropy": 2.475511074066162, "loss/hidden": 3.4453125, "loss/jsd": 0.0, "loss/logits": 0.30172815918922424, "step": 1800 }, { "epoch": 0.112625, "grad_norm": 3.21875, "grad_norm_var": 0.08925679524739584, "learning_rate": 0.0001, "loss": 8.7688, "loss/crossentropy": 2.4699066877365112, "loss/hidden": 3.421875, "loss/jsd": 0.0, "loss/logits": 0.2895689606666565, "step": 1802 }, { "epoch": 0.11275, "grad_norm": 3.21875, "grad_norm_var": 0.09072265625, "learning_rate": 0.0001, "loss": 8.7812, "loss/crossentropy": 2.2400593757629395, "loss/hidden": 3.4375, "loss/jsd": 0.0, "loss/logits": 0.28233984112739563, "step": 1804 }, { "epoch": 0.112875, "grad_norm": 3.90625, "grad_norm_var": 0.10087788899739583, "learning_rate": 0.0001, "loss": 8.7431, "loss/crossentropy": 2.3116856813430786, "loss/hidden": 3.59375, "loss/jsd": 0.0, "loss/logits": 0.306411549448967, "step": 1806 }, { "epoch": 0.113, "grad_norm": 4.0625, "grad_norm_var": 0.12112528483072917, "learning_rate": 0.0001, "loss": 8.7914, "loss/crossentropy": 2.133602797985077, "loss/hidden": 3.453125, "loss/jsd": 0.0, "loss/logits": 0.28208571672439575, "step": 1808 }, { "epoch": 0.113125, "grad_norm": 3.28125, "grad_norm_var": 0.12552083333333333, "learning_rate": 0.0001, "loss": 8.5299, "loss/crossentropy": 2.2401410341262817, "loss/hidden": 3.5546875, "loss/jsd": 0.0, "loss/logits": 0.29185375571250916, "step": 1810 }, { "epoch": 0.11325, "grad_norm": 4.53125, "grad_norm_var": 0.157666015625, "learning_rate": 0.0001, "loss": 8.4904, "loss/crossentropy": 1.93543541431427, "loss/hidden": 3.5, "loss/jsd": 0.0, "loss/logits": 0.3017665445804596, "step": 1812 }, { "epoch": 0.113375, "grad_norm": 3.5, "grad_norm_var": 0.15843098958333332, "learning_rate": 0.0001, "loss": 8.7415, "loss/crossentropy": 2.243703246116638, "loss/hidden": 3.5, "loss/jsd": 0.0, "loss/logits": 0.2688685953617096, "step": 1814 }, { "epoch": 0.1135, "grad_norm": 3.328125, "grad_norm_var": 0.16135660807291666, "learning_rate": 0.0001, "loss": 8.5466, "loss/crossentropy": 2.333595037460327, "loss/hidden": 3.4296875, "loss/jsd": 0.0, "loss/logits": 0.2719716727733612, "step": 1816 }, { "epoch": 0.113625, "grad_norm": 3.8125, "grad_norm_var": 0.17988179524739584, "learning_rate": 0.0001, "loss": 8.6006, "loss/crossentropy": 1.8939169049263, "loss/hidden": 3.53125, "loss/jsd": 0.0, "loss/logits": 0.24589256942272186, "step": 1818 }, { "epoch": 0.11375, "grad_norm": 3.953125, "grad_norm_var": 0.17858784993489582, "learning_rate": 0.0001, "loss": 8.9438, "loss/crossentropy": 2.455257773399353, "loss/hidden": 3.5859375, "loss/jsd": 0.0, "loss/logits": 0.2882642447948456, "step": 1820 }, { "epoch": 0.113875, "grad_norm": 5.53125, "grad_norm_var": 0.39384358723958335, "learning_rate": 0.0001, "loss": 9.071, "loss/crossentropy": 2.2096447944641113, "loss/hidden": 3.7265625, "loss/jsd": 0.0, "loss/logits": 0.3491516262292862, "step": 1822 }, { "epoch": 0.114, "grad_norm": 3.21875, "grad_norm_var": 0.37707417805989585, "learning_rate": 0.0001, "loss": 8.5158, "loss/crossentropy": 2.08541738986969, "loss/hidden": 3.578125, "loss/jsd": 0.0, "loss/logits": 0.2686326503753662, "step": 1824 }, { "epoch": 0.114125, "grad_norm": 3.765625, "grad_norm_var": 0.3614908854166667, "learning_rate": 0.0001, "loss": 8.6355, "loss/crossentropy": 2.0728381276130676, "loss/hidden": 3.6015625, "loss/jsd": 0.0, "loss/logits": 0.3196987956762314, "step": 1826 }, { "epoch": 0.11425, "grad_norm": 3.171875, "grad_norm_var": 0.32328999837239586, "learning_rate": 0.0001, "loss": 8.6977, "loss/crossentropy": 2.4653478860855103, "loss/hidden": 3.515625, "loss/jsd": 0.0, "loss/logits": 0.3111522048711777, "step": 1828 }, { "epoch": 0.114375, "grad_norm": 6.53125, "grad_norm_var": 0.8534088134765625, "learning_rate": 0.0001, "loss": 9.0379, "loss/crossentropy": 2.472045063972473, "loss/hidden": 3.5390625, "loss/jsd": 0.0, "loss/logits": 0.28499045968055725, "step": 1830 }, { "epoch": 0.1145, "grad_norm": 3.9375, "grad_norm_var": 0.8278472900390625, "learning_rate": 0.0001, "loss": 8.3966, "loss/crossentropy": 2.365163564682007, "loss/hidden": 3.46875, "loss/jsd": 0.0, "loss/logits": 0.30011722445487976, "step": 1832 }, { "epoch": 0.114625, "grad_norm": 3.21875, "grad_norm_var": 0.8778279622395834, "learning_rate": 0.0001, "loss": 8.6259, "loss/crossentropy": 2.221544623374939, "loss/hidden": 3.3984375, "loss/jsd": 0.0, "loss/logits": 0.27982884645462036, "step": 1834 }, { "epoch": 0.11475, "grad_norm": 3.5, "grad_norm_var": 0.8932576497395833, "learning_rate": 0.0001, "loss": 8.586, "loss/crossentropy": 2.248165249824524, "loss/hidden": 3.5390625, "loss/jsd": 0.0, "loss/logits": 0.2916170507669449, "step": 1836 }, { "epoch": 0.114875, "grad_norm": 3.703125, "grad_norm_var": 0.7109202067057292, "learning_rate": 0.0001, "loss": 8.7218, "loss/crossentropy": 2.449094533920288, "loss/hidden": 3.5078125, "loss/jsd": 0.0, "loss/logits": 0.2794910967350006, "step": 1838 }, { "epoch": 0.115, "grad_norm": 3.921875, "grad_norm_var": 0.6978830973307292, "learning_rate": 0.0001, "loss": 9.1572, "loss/crossentropy": 2.3719322681427, "loss/hidden": 3.5234375, "loss/jsd": 0.0, "loss/logits": 0.32727518677711487, "step": 1840 }, { "epoch": 0.115125, "grad_norm": 4.09375, "grad_norm_var": 0.7026652018229167, "learning_rate": 0.0001, "loss": 8.8725, "loss/crossentropy": 2.5066757202148438, "loss/hidden": 3.46875, "loss/jsd": 0.0, "loss/logits": 0.26487046480178833, "step": 1842 }, { "epoch": 0.11525, "grad_norm": 3.59375, "grad_norm_var": 0.6559855143229166, "learning_rate": 0.0001, "loss": 8.6606, "loss/crossentropy": 2.0091428756713867, "loss/hidden": 3.7734375, "loss/jsd": 0.0, "loss/logits": 0.26702386140823364, "step": 1844 }, { "epoch": 0.115375, "grad_norm": 3.765625, "grad_norm_var": 0.18411051432291667, "learning_rate": 0.0001, "loss": 8.6167, "loss/crossentropy": 2.268906354904175, "loss/hidden": 3.4609375, "loss/jsd": 0.0, "loss/logits": 0.2717943787574768, "step": 1846 }, { "epoch": 0.1155, "grad_norm": 3.078125, "grad_norm_var": 0.2439453125, "learning_rate": 0.0001, "loss": 8.5006, "loss/crossentropy": 2.1651817560195923, "loss/hidden": 3.515625, "loss/jsd": 0.0, "loss/logits": 0.2848515063524246, "step": 1848 }, { "epoch": 0.115625, "grad_norm": 3.484375, "grad_norm_var": 0.20624898274739584, "learning_rate": 0.0001, "loss": 8.8499, "loss/crossentropy": 2.2831424474716187, "loss/hidden": 3.4375, "loss/jsd": 0.0, "loss/logits": 0.2911527752876282, "step": 1850 }, { "epoch": 0.11575, "grad_norm": 4.28125, "grad_norm_var": 0.21565348307291668, "learning_rate": 0.0001, "loss": 8.6357, "loss/crossentropy": 2.132005453109741, "loss/hidden": 3.53125, "loss/jsd": 0.0, "loss/logits": 0.26818957924842834, "step": 1852 }, { "epoch": 0.115875, "grad_norm": 3.484375, "grad_norm_var": 0.2232086181640625, "learning_rate": 0.0001, "loss": 8.6977, "loss/crossentropy": 2.406746983528137, "loss/hidden": 3.4921875, "loss/jsd": 0.0, "loss/logits": 0.28900858759880066, "step": 1854 }, { "epoch": 0.116, "grad_norm": 3.515625, "grad_norm_var": 0.21317952473958332, "learning_rate": 0.0001, "loss": 8.9313, "loss/crossentropy": 2.220999240875244, "loss/hidden": 3.578125, "loss/jsd": 0.0, "loss/logits": 0.28174880146980286, "step": 1856 }, { "epoch": 0.116125, "grad_norm": 3.046875, "grad_norm_var": 0.21760965983072916, "learning_rate": 0.0001, "loss": 8.724, "loss/crossentropy": 2.195012092590332, "loss/hidden": 3.421875, "loss/jsd": 0.0, "loss/logits": 0.26727311313152313, "step": 1858 }, { "epoch": 0.11625, "grad_norm": 3.140625, "grad_norm_var": 0.23176167805989584, "learning_rate": 0.0001, "loss": 8.82, "loss/crossentropy": 2.5875377655029297, "loss/hidden": 3.484375, "loss/jsd": 0.0, "loss/logits": 0.30315685272216797, "step": 1860 }, { "epoch": 0.116375, "grad_norm": 3.390625, "grad_norm_var": 0.23212788899739584, "learning_rate": 0.0001, "loss": 8.7209, "loss/crossentropy": 2.4106770753860474, "loss/hidden": 3.3984375, "loss/jsd": 0.0, "loss/logits": 0.283745214343071, "step": 1862 }, { "epoch": 0.1165, "grad_norm": 3.359375, "grad_norm_var": 0.2173736572265625, "learning_rate": 0.0001, "loss": 8.7897, "loss/crossentropy": 2.1879663467407227, "loss/hidden": 3.421875, "loss/jsd": 0.0, "loss/logits": 0.2908755838871002, "step": 1864 }, { "epoch": 0.116625, "grad_norm": 6.40625, "grad_norm_var": 0.6588216145833333, "learning_rate": 0.0001, "loss": 8.5523, "loss/crossentropy": 2.3278889656066895, "loss/hidden": 3.3125, "loss/jsd": 0.0, "loss/logits": 0.24966130405664444, "step": 1866 }, { "epoch": 0.11675, "grad_norm": 3.984375, "grad_norm_var": 0.6225545247395833, "learning_rate": 0.0001, "loss": 9.0472, "loss/crossentropy": 2.3500643968582153, "loss/hidden": 3.59375, "loss/jsd": 0.0, "loss/logits": 0.3106984347105026, "step": 1868 }, { "epoch": 0.116875, "grad_norm": 5.34375, "grad_norm_var": 0.8042633056640625, "learning_rate": 0.0001, "loss": 8.9806, "loss/crossentropy": 2.6106945276260376, "loss/hidden": 3.6484375, "loss/jsd": 0.0, "loss/logits": 0.31557662785053253, "step": 1870 }, { "epoch": 0.117, "grad_norm": 4.0625, "grad_norm_var": 0.8073201497395833, "learning_rate": 0.0001, "loss": 8.9802, "loss/crossentropy": 2.4700088500976562, "loss/hidden": 3.40625, "loss/jsd": 0.0, "loss/logits": 0.28965385258197784, "step": 1872 }, { "epoch": 0.117125, "grad_norm": 4.28125, "grad_norm_var": 0.7607381184895833, "learning_rate": 0.0001, "loss": 8.6394, "loss/crossentropy": 2.478583335876465, "loss/hidden": 3.4296875, "loss/jsd": 0.0, "loss/logits": 0.2889174222946167, "step": 1874 }, { "epoch": 0.11725, "grad_norm": 4.375, "grad_norm_var": 0.7517812093098958, "learning_rate": 0.0001, "loss": 9.1359, "loss/crossentropy": 2.443656802177429, "loss/hidden": 3.4765625, "loss/jsd": 0.0, "loss/logits": 0.3275313228368759, "step": 1876 }, { "epoch": 0.117375, "grad_norm": 3.609375, "grad_norm_var": 0.7418253580729167, "learning_rate": 0.0001, "loss": 8.6034, "loss/crossentropy": 2.3004201650619507, "loss/hidden": 3.5078125, "loss/jsd": 0.0, "loss/logits": 0.2772568315267563, "step": 1878 }, { "epoch": 0.1175, "grad_norm": 3.640625, "grad_norm_var": 0.6761383056640625, "learning_rate": 0.0001, "loss": 8.8082, "loss/crossentropy": 2.2469045519828796, "loss/hidden": 3.6328125, "loss/jsd": 0.0, "loss/logits": 0.3228066712617874, "step": 1880 }, { "epoch": 0.117625, "grad_norm": 4.25, "grad_norm_var": 0.2380279541015625, "learning_rate": 0.0001, "loss": 8.8525, "loss/crossentropy": 2.419469714164734, "loss/hidden": 3.5234375, "loss/jsd": 0.0, "loss/logits": 0.29159781336784363, "step": 1882 }, { "epoch": 0.11775, "grad_norm": 3.03125, "grad_norm_var": 0.3133626302083333, "learning_rate": 0.0001, "loss": 8.639, "loss/crossentropy": 2.3337111473083496, "loss/hidden": 3.3359375, "loss/jsd": 0.0, "loss/logits": 0.25790637731552124, "step": 1884 }, { "epoch": 0.117875, "grad_norm": 3.34375, "grad_norm_var": 0.19654032389322917, "learning_rate": 0.0001, "loss": 8.4541, "loss/crossentropy": 2.2113906145095825, "loss/hidden": 3.5390625, "loss/jsd": 0.0, "loss/logits": 0.29410840570926666, "step": 1886 }, { "epoch": 0.118, "grad_norm": 3.375, "grad_norm_var": 0.19166666666666668, "learning_rate": 0.0001, "loss": 8.7409, "loss/crossentropy": 2.274120569229126, "loss/hidden": 3.546875, "loss/jsd": 0.0, "loss/logits": 0.29355761408805847, "step": 1888 }, { "epoch": 0.118125, "grad_norm": 3.296875, "grad_norm_var": 0.17994791666666668, "learning_rate": 0.0001, "loss": 8.4548, "loss/crossentropy": 2.090532958507538, "loss/hidden": 3.6171875, "loss/jsd": 0.0, "loss/logits": 0.32575349509716034, "step": 1890 }, { "epoch": 0.11825, "grad_norm": 3.765625, "grad_norm_var": 0.1302398681640625, "learning_rate": 0.0001, "loss": 8.894, "loss/crossentropy": 2.48271107673645, "loss/hidden": 3.484375, "loss/jsd": 0.0, "loss/logits": 0.29649099707603455, "step": 1892 }, { "epoch": 0.118375, "grad_norm": 4.09375, "grad_norm_var": 0.15208333333333332, "learning_rate": 0.0001, "loss": 8.5682, "loss/crossentropy": 2.1585158109664917, "loss/hidden": 3.3984375, "loss/jsd": 0.0, "loss/logits": 0.24566738307476044, "step": 1894 }, { "epoch": 0.1185, "grad_norm": 3.4375, "grad_norm_var": 0.15894266764322917, "learning_rate": 0.0001, "loss": 8.5728, "loss/crossentropy": 2.22609806060791, "loss/hidden": 3.546875, "loss/jsd": 0.0, "loss/logits": 0.27473118901252747, "step": 1896 }, { "epoch": 0.118625, "grad_norm": 6.9375, "grad_norm_var": 0.8718739827473958, "learning_rate": 0.0001, "loss": 9.0188, "loss/crossentropy": 2.3936339616775513, "loss/hidden": 3.59375, "loss/jsd": 0.0, "loss/logits": 0.42816175520420074, "step": 1898 }, { "epoch": 0.11875, "grad_norm": 3.421875, "grad_norm_var": 0.8406809488932292, "learning_rate": 0.0001, "loss": 8.5265, "loss/crossentropy": 2.333956003189087, "loss/hidden": 3.4453125, "loss/jsd": 0.0, "loss/logits": 0.28808265924453735, "step": 1900 }, { "epoch": 0.118875, "grad_norm": 3.296875, "grad_norm_var": 0.8180826822916667, "learning_rate": 0.0001, "loss": 8.5573, "loss/crossentropy": 1.9834295511245728, "loss/hidden": 3.5859375, "loss/jsd": 0.0, "loss/logits": 0.3038376271724701, "step": 1902 }, { "epoch": 0.119, "grad_norm": 4.46875, "grad_norm_var": 1.80859375, "learning_rate": 0.0001, "loss": 9.3781, "loss/crossentropy": 2.3898226022720337, "loss/hidden": 3.7109375, "loss/jsd": 0.0, "loss/logits": 0.4111027866601944, "step": 1904 }, { "epoch": 0.119125, "grad_norm": 3.578125, "grad_norm_var": 1.751301066080729, "learning_rate": 0.0001, "loss": 8.7397, "loss/crossentropy": 2.14963698387146, "loss/hidden": 3.5390625, "loss/jsd": 0.0, "loss/logits": 0.27916258573532104, "step": 1906 }, { "epoch": 0.11925, "grad_norm": 3.078125, "grad_norm_var": 1.851707967122396, "learning_rate": 0.0001, "loss": 8.3666, "loss/crossentropy": 1.9682239890098572, "loss/hidden": 3.4609375, "loss/jsd": 0.0, "loss/logits": 0.27202266454696655, "step": 1908 }, { "epoch": 0.119375, "grad_norm": 3.453125, "grad_norm_var": 1.8763824462890626, "learning_rate": 0.0001, "loss": 8.6178, "loss/crossentropy": 2.1532257795333862, "loss/hidden": 3.5234375, "loss/jsd": 0.0, "loss/logits": 0.2853679060935974, "step": 1910 }, { "epoch": 0.1195, "grad_norm": 3.265625, "grad_norm_var": 1.8948567708333333, "learning_rate": 0.0001, "loss": 8.7041, "loss/crossentropy": 1.927408516407013, "loss/hidden": 3.3515625, "loss/jsd": 0.0, "loss/logits": 0.2693905234336853, "step": 1912 }, { "epoch": 0.119625, "grad_norm": 4.25, "grad_norm_var": 1.2513671875, "learning_rate": 0.0001, "loss": 8.4468, "loss/crossentropy": 2.3485519886016846, "loss/hidden": 3.375, "loss/jsd": 0.0, "loss/logits": 0.29178424179553986, "step": 1914 }, { "epoch": 0.11975, "grad_norm": 3.203125, "grad_norm_var": 1.27554931640625, "learning_rate": 0.0001, "loss": 8.335, "loss/crossentropy": 2.2744147777557373, "loss/hidden": 3.4375, "loss/jsd": 0.0, "loss/logits": 0.27281489968299866, "step": 1916 }, { "epoch": 0.119875, "grad_norm": 3.015625, "grad_norm_var": 1.3215321858723958, "learning_rate": 0.0001, "loss": 8.4879, "loss/crossentropy": 2.2114824056625366, "loss/hidden": 3.4921875, "loss/jsd": 0.0, "loss/logits": 0.2593921273946762, "step": 1918 }, { "epoch": 0.12, "grad_norm": 3.15625, "grad_norm_var": 0.15273030598958334, "learning_rate": 0.0001, "loss": 8.3128, "loss/crossentropy": 1.89790540933609, "loss/hidden": 3.3515625, "loss/jsd": 0.0, "loss/logits": 0.25455522537231445, "step": 1920 }, { "epoch": 0.120125, "grad_norm": 3.15625, "grad_norm_var": 0.15286458333333333, "learning_rate": 0.0001, "loss": 8.4771, "loss/crossentropy": 2.229190468788147, "loss/hidden": 3.546875, "loss/jsd": 0.0, "loss/logits": 0.2908050864934921, "step": 1922 }, { "epoch": 0.12025, "grad_norm": 3.859375, "grad_norm_var": 0.1630859375, "learning_rate": 0.0001, "loss": 8.5646, "loss/crossentropy": 2.3038084506988525, "loss/hidden": 3.4765625, "loss/jsd": 0.0, "loss/logits": 0.27247732132673264, "step": 1924 }, { "epoch": 0.120375, "grad_norm": 3.296875, "grad_norm_var": 0.1736724853515625, "learning_rate": 0.0001, "loss": 8.7061, "loss/crossentropy": 1.9715904593467712, "loss/hidden": 3.765625, "loss/jsd": 0.0, "loss/logits": 0.32534220814704895, "step": 1926 }, { "epoch": 0.1205, "grad_norm": 3.40625, "grad_norm_var": 0.16855367024739584, "learning_rate": 0.0001, "loss": 8.4684, "loss/crossentropy": 2.3967236280441284, "loss/hidden": 3.421875, "loss/jsd": 0.0, "loss/logits": 0.3001554310321808, "step": 1928 }, { "epoch": 0.120625, "grad_norm": 6.8125, "grad_norm_var": 0.8520985921223958, "learning_rate": 0.0001, "loss": 9.2296, "loss/crossentropy": 2.2211365699768066, "loss/hidden": 3.484375, "loss/jsd": 0.0, "loss/logits": 0.29438023269176483, "step": 1930 }, { "epoch": 0.12075, "grad_norm": 3.328125, "grad_norm_var": 0.8120351155598958, "learning_rate": 0.0001, "loss": 8.3795, "loss/crossentropy": 1.7906777262687683, "loss/hidden": 3.40625, "loss/jsd": 0.0, "loss/logits": 0.2534093111753464, "step": 1932 }, { "epoch": 0.120875, "grad_norm": 4.53125, "grad_norm_var": 0.83876953125, "learning_rate": 0.0001, "loss": 8.6791, "loss/crossentropy": 2.0840158462524414, "loss/hidden": 3.4921875, "loss/jsd": 0.0, "loss/logits": 0.3184715062379837, "step": 1934 }, { "epoch": 0.121, "grad_norm": 3.828125, "grad_norm_var": 0.87467041015625, "learning_rate": 0.0001, "loss": 8.7778, "loss/crossentropy": 2.0006986260414124, "loss/hidden": 3.4609375, "loss/jsd": 0.0, "loss/logits": 0.25994736701250076, "step": 1936 }, { "epoch": 0.121125, "grad_norm": 3.21875, "grad_norm_var": 0.8491200764973958, "learning_rate": 0.0001, "loss": 8.4596, "loss/crossentropy": 2.0004652738571167, "loss/hidden": 3.421875, "loss/jsd": 0.0, "loss/logits": 0.22842344641685486, "step": 1938 }, { "epoch": 0.12125, "grad_norm": 3.359375, "grad_norm_var": 0.8299479166666667, "learning_rate": 0.0001, "loss": 8.7803, "loss/crossentropy": 2.550653338432312, "loss/hidden": 3.5234375, "loss/jsd": 0.0, "loss/logits": 0.2856984883546829, "step": 1940 }, { "epoch": 0.121375, "grad_norm": 3.28125, "grad_norm_var": 0.8327870686848958, "learning_rate": 0.0001, "loss": 8.5344, "loss/crossentropy": 2.1163352727890015, "loss/hidden": 3.4609375, "loss/jsd": 0.0, "loss/logits": 0.27338388562202454, "step": 1942 }, { "epoch": 0.1215, "grad_norm": 3.359375, "grad_norm_var": 0.8302042643229167, "learning_rate": 0.0001, "loss": 8.7621, "loss/crossentropy": 2.2076683044433594, "loss/hidden": 3.5234375, "loss/jsd": 0.0, "loss/logits": 0.27183353900909424, "step": 1944 }, { "epoch": 0.121625, "grad_norm": 3.203125, "grad_norm_var": 0.23727213541666667, "learning_rate": 0.0001, "loss": 8.0684, "loss/crossentropy": 1.9610223174095154, "loss/hidden": 3.46875, "loss/jsd": 0.0, "loss/logits": 0.2562180161476135, "step": 1946 }, { "epoch": 0.12175, "grad_norm": 3.625, "grad_norm_var": 0.25097554524739585, "learning_rate": 0.0001, "loss": 8.5389, "loss/crossentropy": 2.194198727607727, "loss/hidden": 3.3984375, "loss/jsd": 0.0, "loss/logits": 0.3487841486930847, "step": 1948 }, { "epoch": 0.121875, "grad_norm": 4.375, "grad_norm_var": 0.22975972493489583, "learning_rate": 0.0001, "loss": 8.7646, "loss/crossentropy": 2.3437373638153076, "loss/hidden": 3.53125, "loss/jsd": 0.0, "loss/logits": 0.32154756784439087, "step": 1950 }, { "epoch": 0.122, "grad_norm": 5.96875, "grad_norm_var": 0.5081868489583333, "learning_rate": 0.0001, "loss": 8.6219, "loss/crossentropy": 2.16952908039093, "loss/hidden": 3.5859375, "loss/jsd": 0.0, "loss/logits": 0.29443131387233734, "step": 1952 }, { "epoch": 0.122125, "grad_norm": 5.71875, "grad_norm_var": 1.9670806884765626, "learning_rate": 0.0001, "loss": 9.1857, "loss/crossentropy": 2.402138590812683, "loss/hidden": 3.46875, "loss/jsd": 0.0, "loss/logits": 0.3063973933458328, "step": 1954 }, { "epoch": 0.12225, "grad_norm": 3.5, "grad_norm_var": 1.9675089518229167, "learning_rate": 0.0001, "loss": 9.1832, "loss/crossentropy": 2.433066964149475, "loss/hidden": 3.5703125, "loss/jsd": 0.0, "loss/logits": 0.3420894145965576, "step": 1956 }, { "epoch": 0.122375, "grad_norm": 3.578125, "grad_norm_var": 1.979572550455729, "learning_rate": 0.0001, "loss": 8.393, "loss/crossentropy": 2.197170615196228, "loss/hidden": 3.3828125, "loss/jsd": 0.0, "loss/logits": 0.2728285938501358, "step": 1958 }, { "epoch": 0.1225, "grad_norm": 3.203125, "grad_norm_var": 2.018635050455729, "learning_rate": 0.0001, "loss": 8.4519, "loss/crossentropy": 2.4277193546295166, "loss/hidden": 3.3828125, "loss/jsd": 0.0, "loss/logits": 0.2740899324417114, "step": 1960 }, { "epoch": 0.122625, "grad_norm": 3.546875, "grad_norm_var": 1.98818359375, "learning_rate": 0.0001, "loss": 8.3599, "loss/crossentropy": 1.8049125671386719, "loss/hidden": 3.4375, "loss/jsd": 0.0, "loss/logits": 0.25449274480342865, "step": 1962 }, { "epoch": 0.12275, "grad_norm": 4.8125, "grad_norm_var": 1.9446126302083333, "learning_rate": 0.0001, "loss": 8.4081, "loss/crossentropy": 2.158496618270874, "loss/hidden": 3.5625, "loss/jsd": 0.0, "loss/logits": 0.3127806931734085, "step": 1964 }, { "epoch": 0.122875, "grad_norm": 3.5625, "grad_norm_var": 1.95035400390625, "learning_rate": 0.0001, "loss": 8.6138, "loss/crossentropy": 2.09112149477005, "loss/hidden": 3.375, "loss/jsd": 0.0, "loss/logits": 0.3012082725763321, "step": 1966 }, { "epoch": 0.123, "grad_norm": 3.421875, "grad_norm_var": 1.7016276041666667, "learning_rate": 0.0001, "loss": 8.2905, "loss/crossentropy": 2.042824864387512, "loss/hidden": 3.578125, "loss/jsd": 0.0, "loss/logits": 0.257434219121933, "step": 1968 }, { "epoch": 0.123125, "grad_norm": 3.078125, "grad_norm_var": 0.17531636555989583, "learning_rate": 0.0001, "loss": 8.3934, "loss/crossentropy": 2.7338595390319824, "loss/hidden": 3.515625, "loss/jsd": 0.0, "loss/logits": 0.26814499497413635, "step": 1970 }, { "epoch": 0.12325, "grad_norm": 3.109375, "grad_norm_var": 0.19435933430989583, "learning_rate": 0.0001, "loss": 8.3918, "loss/crossentropy": 2.137987196445465, "loss/hidden": 3.3828125, "loss/jsd": 0.0, "loss/logits": 0.29963263869285583, "step": 1972 }, { "epoch": 0.123375, "grad_norm": 3.953125, "grad_norm_var": 0.2551015218098958, "learning_rate": 0.0001, "loss": 9.0546, "loss/crossentropy": 2.462228536605835, "loss/hidden": 3.453125, "loss/jsd": 0.0, "loss/logits": 0.3093663305044174, "step": 1974 }, { "epoch": 0.1235, "grad_norm": 3.4375, "grad_norm_var": 0.24345296223958332, "learning_rate": 0.0001, "loss": 8.5238, "loss/crossentropy": 2.2877787351608276, "loss/hidden": 3.5390625, "loss/jsd": 0.0, "loss/logits": 0.3022351413965225, "step": 1976 }, { "epoch": 0.123625, "grad_norm": 3.15625, "grad_norm_var": 0.26203511555989584, "learning_rate": 0.0001, "loss": 8.5626, "loss/crossentropy": 2.2335957288742065, "loss/hidden": 3.3359375, "loss/jsd": 0.0, "loss/logits": 0.28036345541477203, "step": 1978 }, { "epoch": 0.12375, "grad_norm": 3.359375, "grad_norm_var": 0.15058186848958333, "learning_rate": 0.0001, "loss": 8.7114, "loss/crossentropy": 2.0434305667877197, "loss/hidden": 3.4296875, "loss/jsd": 0.0, "loss/logits": 0.26071667671203613, "step": 1980 }, { "epoch": 0.123875, "grad_norm": 3.3125, "grad_norm_var": 0.1507720947265625, "learning_rate": 0.0001, "loss": 8.4172, "loss/crossentropy": 1.9536693692207336, "loss/hidden": 3.4296875, "loss/jsd": 0.0, "loss/logits": 0.27602314949035645, "step": 1982 }, { "epoch": 0.124, "grad_norm": 3.296875, "grad_norm_var": 0.15515034993489582, "learning_rate": 0.0001, "loss": 8.1532, "loss/crossentropy": 2.156287908554077, "loss/hidden": 3.3125, "loss/jsd": 0.0, "loss/logits": 0.2685669958591461, "step": 1984 }, { "epoch": 0.124125, "grad_norm": 3.296875, "grad_norm_var": 0.15143229166666666, "learning_rate": 0.0001, "loss": 8.7167, "loss/crossentropy": 1.9938928484916687, "loss/hidden": 3.484375, "loss/jsd": 0.0, "loss/logits": 0.2635682225227356, "step": 1986 }, { "epoch": 0.12425, "grad_norm": 3.59375, "grad_norm_var": 0.13426005045572917, "learning_rate": 0.0001, "loss": 8.7086, "loss/crossentropy": 2.242967128753662, "loss/hidden": 3.3515625, "loss/jsd": 0.0, "loss/logits": 0.28749316930770874, "step": 1988 }, { "epoch": 0.124375, "grad_norm": 3.921875, "grad_norm_var": 0.0980377197265625, "learning_rate": 0.0001, "loss": 8.441, "loss/crossentropy": 2.2426506876945496, "loss/hidden": 3.4609375, "loss/jsd": 0.0, "loss/logits": 0.278358519077301, "step": 1990 }, { "epoch": 0.1245, "grad_norm": 3.421875, "grad_norm_var": 0.09846598307291667, "learning_rate": 0.0001, "loss": 8.7239, "loss/crossentropy": 2.1755125522613525, "loss/hidden": 3.4921875, "loss/jsd": 0.0, "loss/logits": 0.271381139755249, "step": 1992 }, { "epoch": 0.124625, "grad_norm": 2.953125, "grad_norm_var": 0.1131744384765625, "learning_rate": 0.0001, "loss": 8.2185, "loss/crossentropy": 2.1989212036132812, "loss/hidden": 3.4140625, "loss/jsd": 0.0, "loss/logits": 0.2699214518070221, "step": 1994 }, { "epoch": 0.12475, "grad_norm": 3.25, "grad_norm_var": 0.08835347493489583, "learning_rate": 0.0001, "loss": 8.6419, "loss/crossentropy": 2.1517637968063354, "loss/hidden": 3.34375, "loss/jsd": 0.0, "loss/logits": 0.2819422334432602, "step": 1996 }, { "epoch": 0.124875, "grad_norm": 3.34375, "grad_norm_var": 0.09705301920572916, "learning_rate": 0.0001, "loss": 8.4251, "loss/crossentropy": 2.1624850034713745, "loss/hidden": 3.28125, "loss/jsd": 0.0, "loss/logits": 0.27621982991695404, "step": 1998 }, { "epoch": 0.125, "grad_norm": 4.15625, "grad_norm_var": 0.15205790201822916, "learning_rate": 0.0001, "loss": 8.5759, "loss/crossentropy": 2.1453830003738403, "loss/hidden": 3.546875, "loss/jsd": 0.0, "loss/logits": 0.3055148124694824, "step": 2000 }, { "epoch": 0.125125, "grad_norm": 3.59375, "grad_norm_var": 0.16303609212239584, "learning_rate": 0.0001, "loss": 8.4771, "loss/crossentropy": 2.1015294790267944, "loss/hidden": 3.6015625, "loss/jsd": 0.0, "loss/logits": 0.25279899686574936, "step": 2002 }, { "epoch": 0.12525, "grad_norm": 4.5, "grad_norm_var": 0.22411702473958334, "learning_rate": 0.0001, "loss": 8.7649, "loss/crossentropy": 2.175279974937439, "loss/hidden": 3.3671875, "loss/jsd": 0.0, "loss/logits": 0.2848304957151413, "step": 2004 }, { "epoch": 0.125375, "grad_norm": 3.234375, "grad_norm_var": 0.28280843098958336, "learning_rate": 0.0001, "loss": 8.5977, "loss/crossentropy": 2.1618118286132812, "loss/hidden": 3.40625, "loss/jsd": 0.0, "loss/logits": 0.2875328063964844, "step": 2006 }, { "epoch": 0.1255, "grad_norm": 3.78125, "grad_norm_var": 0.30917561848958336, "learning_rate": 0.0001, "loss": 9.2233, "loss/crossentropy": 2.391259789466858, "loss/hidden": 3.609375, "loss/jsd": 0.0, "loss/logits": 0.3345881402492523, "step": 2008 }, { "epoch": 0.125625, "grad_norm": 3.265625, "grad_norm_var": 0.26471354166666666, "learning_rate": 0.0001, "loss": 8.4709, "loss/crossentropy": 2.342541813850403, "loss/hidden": 3.4375, "loss/jsd": 0.0, "loss/logits": 0.2817014455795288, "step": 2010 }, { "epoch": 0.12575, "grad_norm": 3.390625, "grad_norm_var": 0.2509104410807292, "learning_rate": 0.0001, "loss": 8.8183, "loss/crossentropy": 2.527478814125061, "loss/hidden": 3.421875, "loss/jsd": 0.0, "loss/logits": 0.2881559580564499, "step": 2012 }, { "epoch": 0.125875, "grad_norm": 3.890625, "grad_norm_var": 0.19236551920572917, "learning_rate": 0.0001, "loss": 8.9042, "loss/crossentropy": 2.19675874710083, "loss/hidden": 3.5625, "loss/jsd": 0.0, "loss/logits": 0.3004467785358429, "step": 2014 }, { "epoch": 0.126, "grad_norm": 2.984375, "grad_norm_var": 0.25149332682291664, "learning_rate": 0.0001, "loss": 8.2422, "loss/crossentropy": 2.175578236579895, "loss/hidden": 3.34375, "loss/jsd": 0.0, "loss/logits": 0.2675127685070038, "step": 2016 }, { "epoch": 0.126125, "grad_norm": 3.796875, "grad_norm_var": 0.2751261393229167, "learning_rate": 0.0001, "loss": 8.803, "loss/crossentropy": 2.271016001701355, "loss/hidden": 3.453125, "loss/jsd": 0.0, "loss/logits": 0.3039353936910629, "step": 2018 }, { "epoch": 0.12625, "grad_norm": 3.46875, "grad_norm_var": 0.23128255208333334, "learning_rate": 0.0001, "loss": 8.3209, "loss/crossentropy": 1.908856749534607, "loss/hidden": 3.4375, "loss/jsd": 0.0, "loss/logits": 0.2831180691719055, "step": 2020 }, { "epoch": 0.126375, "grad_norm": 3.046875, "grad_norm_var": 0.17986551920572916, "learning_rate": 0.0001, "loss": 8.3486, "loss/crossentropy": 2.367830276489258, "loss/hidden": 3.4921875, "loss/jsd": 0.0, "loss/logits": 0.2986317425966263, "step": 2022 }, { "epoch": 0.1265, "grad_norm": 3.78125, "grad_norm_var": 0.13924153645833334, "learning_rate": 0.0001, "loss": 8.4988, "loss/crossentropy": 2.4253504276275635, "loss/hidden": 3.4921875, "loss/jsd": 0.0, "loss/logits": 0.2815774828195572, "step": 2024 }, { "epoch": 0.126625, "grad_norm": 3.1875, "grad_norm_var": 0.08665364583333333, "learning_rate": 0.0001, "loss": 8.5443, "loss/crossentropy": 2.2791528701782227, "loss/hidden": 3.3984375, "loss/jsd": 0.0, "loss/logits": 0.27285344898700714, "step": 2026 }, { "epoch": 0.12675, "grad_norm": 7.0625, "grad_norm_var": 0.9373006184895833, "learning_rate": 0.0001, "loss": 8.77, "loss/crossentropy": 1.958154559135437, "loss/hidden": 3.5625, "loss/jsd": 0.0, "loss/logits": 0.25279736518859863, "step": 2028 }, { "epoch": 0.126875, "grad_norm": 3.375, "grad_norm_var": 0.9443318684895833, "learning_rate": 0.0001, "loss": 8.6024, "loss/crossentropy": 2.350911259651184, "loss/hidden": 3.609375, "loss/jsd": 0.0, "loss/logits": 0.2887439876794815, "step": 2030 }, { "epoch": 0.127, "grad_norm": 3.390625, "grad_norm_var": 1.3159993489583333, "learning_rate": 0.0001, "loss": 8.7633, "loss/crossentropy": 2.1270240545272827, "loss/hidden": 3.6875, "loss/jsd": 0.0, "loss/logits": 0.26917576789855957, "step": 2032 }, { "epoch": 0.127125, "grad_norm": 3.203125, "grad_norm_var": 1.31353759765625, "learning_rate": 0.0001, "loss": 8.9863, "loss/crossentropy": 2.513992667198181, "loss/hidden": 3.5234375, "loss/jsd": 0.0, "loss/logits": 0.3112826496362686, "step": 2034 }, { "epoch": 0.12725, "grad_norm": 3.03125, "grad_norm_var": 1.353416951497396, "learning_rate": 0.0001, "loss": 8.6818, "loss/crossentropy": 2.295058488845825, "loss/hidden": 3.5390625, "loss/jsd": 0.0, "loss/logits": 0.3167324960231781, "step": 2036 }, { "epoch": 0.127375, "grad_norm": 3.125, "grad_norm_var": 1.346240234375, "learning_rate": 0.0001, "loss": 8.6107, "loss/crossentropy": 2.483068585395813, "loss/hidden": 3.40625, "loss/jsd": 0.0, "loss/logits": 0.2947251796722412, "step": 2038 }, { "epoch": 0.1275, "grad_norm": 3.5625, "grad_norm_var": 2.343317667643229, "learning_rate": 0.0001, "loss": 8.2658, "loss/crossentropy": 2.137487053871155, "loss/hidden": 3.46875, "loss/jsd": 0.0, "loss/logits": 0.25359949469566345, "step": 2040 }, { "epoch": 0.127625, "grad_norm": 2.890625, "grad_norm_var": 2.379719034830729, "learning_rate": 0.0001, "loss": 8.5041, "loss/crossentropy": 2.1284241676330566, "loss/hidden": 3.3828125, "loss/jsd": 0.0, "loss/logits": 0.26821649074554443, "step": 2042 }, { "epoch": 0.12775, "grad_norm": 3.296875, "grad_norm_var": 1.7020345052083334, "learning_rate": 0.0001, "loss": 8.5597, "loss/crossentropy": 2.4801424741744995, "loss/hidden": 3.40625, "loss/jsd": 0.0, "loss/logits": 0.2893804758787155, "step": 2044 }, { "epoch": 0.127875, "grad_norm": 3.34375, "grad_norm_var": 1.70904541015625, "learning_rate": 0.0001, "loss": 8.5726, "loss/crossentropy": 2.253470778465271, "loss/hidden": 3.625, "loss/jsd": 0.0, "loss/logits": 0.3076936900615692, "step": 2046 }, { "epoch": 0.128, "grad_norm": 3.046875, "grad_norm_var": 1.298591105143229, "learning_rate": 0.0001, "loss": 8.2338, "loss/crossentropy": 2.0723642110824585, "loss/hidden": 3.3125, "loss/jsd": 0.0, "loss/logits": 0.256762757897377, "step": 2048 }, { "epoch": 0.128125, "grad_norm": 3.640625, "grad_norm_var": 1.2935048421223958, "learning_rate": 0.0001, "loss": 8.5358, "loss/crossentropy": 2.2424747943878174, "loss/hidden": 3.40625, "loss/jsd": 0.0, "loss/logits": 0.28199443221092224, "step": 2050 }, { "epoch": 0.12825, "grad_norm": 3.34375, "grad_norm_var": 1.2793253580729167, "learning_rate": 0.0001, "loss": 8.4589, "loss/crossentropy": 2.0916874408721924, "loss/hidden": 3.4609375, "loss/jsd": 0.0, "loss/logits": 0.262145534157753, "step": 2052 }, { "epoch": 0.128375, "grad_norm": 2.953125, "grad_norm_var": 1.3067291259765625, "learning_rate": 0.0001, "loss": 8.1382, "loss/crossentropy": 2.000722289085388, "loss/hidden": 3.3828125, "loss/jsd": 0.0, "loss/logits": 0.25160059332847595, "step": 2054 }, { "epoch": 0.1285, "grad_norm": 3.390625, "grad_norm_var": 0.062451171875, "learning_rate": 0.0001, "loss": 8.3365, "loss/crossentropy": 2.1112619042396545, "loss/hidden": 3.5078125, "loss/jsd": 0.0, "loss/logits": 0.29667581617832184, "step": 2056 }, { "epoch": 0.128625, "grad_norm": 3.34375, "grad_norm_var": 0.05698954264322917, "learning_rate": 0.0001, "loss": 8.5283, "loss/crossentropy": 2.365256905555725, "loss/hidden": 3.3671875, "loss/jsd": 0.0, "loss/logits": 0.2903009057044983, "step": 2058 }, { "epoch": 0.12875, "grad_norm": 3.484375, "grad_norm_var": 0.05921122233072917, "learning_rate": 0.0001, "loss": 8.4521, "loss/crossentropy": 2.475534200668335, "loss/hidden": 3.4609375, "loss/jsd": 0.0, "loss/logits": 0.29650408029556274, "step": 2060 }, { "epoch": 0.128875, "grad_norm": 3.671875, "grad_norm_var": 0.06564127604166667, "learning_rate": 0.0001, "loss": 8.7537, "loss/crossentropy": 2.352292776107788, "loss/hidden": 3.53125, "loss/jsd": 0.0, "loss/logits": 0.3086307942867279, "step": 2062 }, { "epoch": 0.129, "grad_norm": 3.140625, "grad_norm_var": 0.06318359375, "learning_rate": 0.0001, "loss": 8.6444, "loss/crossentropy": 2.4217324256896973, "loss/hidden": 3.5, "loss/jsd": 0.0, "loss/logits": 0.2903679832816124, "step": 2064 }, { "epoch": 0.129125, "grad_norm": 3.359375, "grad_norm_var": 0.05821940104166667, "learning_rate": 0.0001, "loss": 8.7567, "loss/crossentropy": 2.4540868997573853, "loss/hidden": 3.53125, "loss/jsd": 0.0, "loss/logits": 0.2984987795352936, "step": 2066 }, { "epoch": 0.12925, "grad_norm": 3.09375, "grad_norm_var": 0.043505859375, "learning_rate": 0.0001, "loss": 8.6336, "loss/crossentropy": 2.3689513206481934, "loss/hidden": 3.578125, "loss/jsd": 0.0, "loss/logits": 0.2793779522180557, "step": 2068 }, { "epoch": 0.129375, "grad_norm": 3.25, "grad_norm_var": 0.028465779622395833, "learning_rate": 0.0001, "loss": 8.5215, "loss/crossentropy": 2.1646922826766968, "loss/hidden": 3.3359375, "loss/jsd": 0.0, "loss/logits": 0.26366159319877625, "step": 2070 }, { "epoch": 0.1295, "grad_norm": 3.21875, "grad_norm_var": 0.0303619384765625, "learning_rate": 0.0001, "loss": 8.4048, "loss/crossentropy": 2.1877001523971558, "loss/hidden": 3.421875, "loss/jsd": 0.0, "loss/logits": 0.2803114950656891, "step": 2072 }, { "epoch": 0.129625, "grad_norm": 3.484375, "grad_norm_var": 0.027302042643229166, "learning_rate": 0.0001, "loss": 8.5352, "loss/crossentropy": 2.2874704599380493, "loss/hidden": 3.5859375, "loss/jsd": 0.0, "loss/logits": 0.28932854533195496, "step": 2074 }, { "epoch": 0.12975, "grad_norm": 3.578125, "grad_norm_var": 0.029963175455729168, "learning_rate": 0.0001, "loss": 8.4727, "loss/crossentropy": 2.2664074897766113, "loss/hidden": 3.3671875, "loss/jsd": 0.0, "loss/logits": 0.2693224251270294, "step": 2076 }, { "epoch": 0.129875, "grad_norm": 3.8125, "grad_norm_var": 0.037531534830729164, "learning_rate": 0.0001, "loss": 8.4652, "loss/crossentropy": 2.199268341064453, "loss/hidden": 3.4453125, "loss/jsd": 0.0, "loss/logits": 0.27426885068416595, "step": 2078 }, { "epoch": 0.13, "grad_norm": 3.109375, "grad_norm_var": 0.0367828369140625, "learning_rate": 0.0001, "loss": 8.4824, "loss/crossentropy": 2.352415919303894, "loss/hidden": 3.46875, "loss/jsd": 0.0, "loss/logits": 0.2618487477302551, "step": 2080 }, { "epoch": 0.130125, "grad_norm": 3.9375, "grad_norm_var": 0.0666900634765625, "learning_rate": 0.0001, "loss": 8.5941, "loss/crossentropy": 2.2601778507232666, "loss/hidden": 3.34375, "loss/jsd": 0.0, "loss/logits": 0.258955180644989, "step": 2082 }, { "epoch": 0.13025, "grad_norm": 3.453125, "grad_norm_var": 0.078662109375, "learning_rate": 0.0001, "loss": 8.5982, "loss/crossentropy": 2.3822600841522217, "loss/hidden": 3.4765625, "loss/jsd": 0.0, "loss/logits": 0.3113895505666733, "step": 2084 }, { "epoch": 0.130375, "grad_norm": 3.359375, "grad_norm_var": 0.07548726399739583, "learning_rate": 0.0001, "loss": 8.5254, "loss/crossentropy": 1.9892160892486572, "loss/hidden": 3.4609375, "loss/jsd": 0.0, "loss/logits": 0.32570265233516693, "step": 2086 }, { "epoch": 0.1305, "grad_norm": 3.46875, "grad_norm_var": 0.07178446451822916, "learning_rate": 0.0001, "loss": 8.6493, "loss/crossentropy": 2.58270800113678, "loss/hidden": 3.3515625, "loss/jsd": 0.0, "loss/logits": 0.2828732579946518, "step": 2088 }, { "epoch": 0.130625, "grad_norm": 3.296875, "grad_norm_var": 0.0705718994140625, "learning_rate": 0.0001, "loss": 8.5135, "loss/crossentropy": 2.279938220977783, "loss/hidden": 3.4921875, "loss/jsd": 0.0, "loss/logits": 0.2838123142719269, "step": 2090 }, { "epoch": 0.13075, "grad_norm": 3.40625, "grad_norm_var": 0.06545817057291667, "learning_rate": 0.0001, "loss": 8.7187, "loss/crossentropy": 2.6231149435043335, "loss/hidden": 3.421875, "loss/jsd": 0.0, "loss/logits": 0.2854938954114914, "step": 2092 }, { "epoch": 0.130875, "grad_norm": 3.125, "grad_norm_var": 0.06187744140625, "learning_rate": 0.0001, "loss": 8.5052, "loss/crossentropy": 2.239496946334839, "loss/hidden": 3.3828125, "loss/jsd": 0.0, "loss/logits": 0.2700735479593277, "step": 2094 }, { "epoch": 0.131, "grad_norm": 3.4375, "grad_norm_var": 0.07086181640625, "learning_rate": 0.0001, "loss": 8.4547, "loss/crossentropy": 2.1993571519851685, "loss/hidden": 3.4296875, "loss/jsd": 0.0, "loss/logits": 0.2748275548219681, "step": 2096 }, { "epoch": 0.131125, "grad_norm": 3.1875, "grad_norm_var": 0.0490875244140625, "learning_rate": 0.0001, "loss": 8.5754, "loss/crossentropy": 2.477536678314209, "loss/hidden": 3.453125, "loss/jsd": 0.0, "loss/logits": 0.29622790217399597, "step": 2098 }, { "epoch": 0.13125, "grad_norm": 3.046875, "grad_norm_var": 0.05208231608072917, "learning_rate": 0.0001, "loss": 8.1506, "loss/crossentropy": 2.280876398086548, "loss/hidden": 3.421875, "loss/jsd": 0.0, "loss/logits": 0.26434236764907837, "step": 2100 }, { "epoch": 0.131375, "grad_norm": 3.328125, "grad_norm_var": 0.051610310872395836, "learning_rate": 0.0001, "loss": 8.7723, "loss/crossentropy": 2.1745803356170654, "loss/hidden": 3.625, "loss/jsd": 0.0, "loss/logits": 0.2727068364620209, "step": 2102 }, { "epoch": 0.1315, "grad_norm": 3.78125, "grad_norm_var": 0.06534830729166667, "learning_rate": 0.0001, "loss": 8.2232, "loss/crossentropy": 1.9637106657028198, "loss/hidden": 3.3828125, "loss/jsd": 0.0, "loss/logits": 0.26627399027347565, "step": 2104 }, { "epoch": 0.131625, "grad_norm": 3.28125, "grad_norm_var": 0.06668294270833333, "learning_rate": 0.0001, "loss": 8.5942, "loss/crossentropy": 2.588309407234192, "loss/hidden": 3.53125, "loss/jsd": 0.0, "loss/logits": 0.30993786454200745, "step": 2106 }, { "epoch": 0.13175, "grad_norm": 3.28125, "grad_norm_var": 0.06748046875, "learning_rate": 0.0001, "loss": 8.6101, "loss/crossentropy": 2.205321490764618, "loss/hidden": 3.40625, "loss/jsd": 0.0, "loss/logits": 0.2846221327781677, "step": 2108 }, { "epoch": 0.131875, "grad_norm": 3.4375, "grad_norm_var": 0.07307535807291667, "learning_rate": 0.0001, "loss": 8.5935, "loss/crossentropy": 2.3736764192581177, "loss/hidden": 3.4609375, "loss/jsd": 0.0, "loss/logits": 0.28522568941116333, "step": 2110 }, { "epoch": 0.132, "grad_norm": 3.171875, "grad_norm_var": 0.046284993489583336, "learning_rate": 0.0001, "loss": 8.403, "loss/crossentropy": 2.153502941131592, "loss/hidden": 3.3515625, "loss/jsd": 0.0, "loss/logits": 0.27154064178466797, "step": 2112 }, { "epoch": 0.132125, "grad_norm": 2.984375, "grad_norm_var": 0.04732666015625, "learning_rate": 0.0001, "loss": 8.3663, "loss/crossentropy": 2.2930142879486084, "loss/hidden": 3.328125, "loss/jsd": 0.0, "loss/logits": 0.2532115802168846, "step": 2114 }, { "epoch": 0.13225, "grad_norm": 3.3125, "grad_norm_var": 0.043701171875, "learning_rate": 0.0001, "loss": 8.6428, "loss/crossentropy": 2.0804185271263123, "loss/hidden": 3.4453125, "loss/jsd": 0.0, "loss/logits": 0.28456906974315643, "step": 2116 }, { "epoch": 0.132375, "grad_norm": 2.84375, "grad_norm_var": 0.060944620768229166, "learning_rate": 0.0001, "loss": 8.0339, "loss/crossentropy": 2.086849570274353, "loss/hidden": 3.34375, "loss/jsd": 0.0, "loss/logits": 0.23756709694862366, "step": 2118 }, { "epoch": 0.1325, "grad_norm": 4.25, "grad_norm_var": 0.10725809733072916, "learning_rate": 0.0001, "loss": 8.7021, "loss/crossentropy": 2.2268903255462646, "loss/hidden": 3.546875, "loss/jsd": 0.0, "loss/logits": 0.3071790784597397, "step": 2120 }, { "epoch": 0.132625, "grad_norm": 3.328125, "grad_norm_var": 0.1111480712890625, "learning_rate": 0.0001, "loss": 8.471, "loss/crossentropy": 2.027718186378479, "loss/hidden": 3.5625, "loss/jsd": 0.0, "loss/logits": 0.2703370004892349, "step": 2122 }, { "epoch": 0.13275, "grad_norm": 3.21875, "grad_norm_var": 0.111767578125, "learning_rate": 0.0001, "loss": 8.6599, "loss/crossentropy": 2.662946105003357, "loss/hidden": 3.4140625, "loss/jsd": 0.0, "loss/logits": 0.29691681265830994, "step": 2124 }, { "epoch": 0.132875, "grad_norm": 3.34375, "grad_norm_var": 0.10427958170572917, "learning_rate": 0.0001, "loss": 8.5067, "loss/crossentropy": 2.0567835569381714, "loss/hidden": 3.53125, "loss/jsd": 0.0, "loss/logits": 0.2782011330127716, "step": 2126 }, { "epoch": 0.133, "grad_norm": 3.109375, "grad_norm_var": 0.10475972493489584, "learning_rate": 0.0001, "loss": 8.3973, "loss/crossentropy": 2.2174649238586426, "loss/hidden": 3.46875, "loss/jsd": 0.0, "loss/logits": 0.26164379715919495, "step": 2128 }, { "epoch": 0.133125, "grad_norm": 3.296875, "grad_norm_var": 0.0942779541015625, "learning_rate": 0.0001, "loss": 8.2236, "loss/crossentropy": 2.150222599506378, "loss/hidden": 3.53125, "loss/jsd": 0.0, "loss/logits": 0.2761870324611664, "step": 2130 }, { "epoch": 0.13325, "grad_norm": 3.125, "grad_norm_var": 0.0951324462890625, "learning_rate": 0.0001, "loss": 8.3323, "loss/crossentropy": 2.324468970298767, "loss/hidden": 3.359375, "loss/jsd": 0.0, "loss/logits": 0.28926678001880646, "step": 2132 }, { "epoch": 0.133375, "grad_norm": 3.125, "grad_norm_var": 0.0770904541015625, "learning_rate": 0.0001, "loss": 8.5398, "loss/crossentropy": 2.462960124015808, "loss/hidden": 3.5234375, "loss/jsd": 0.0, "loss/logits": 0.29281045496463776, "step": 2134 }, { "epoch": 0.1335, "grad_norm": 3.359375, "grad_norm_var": 0.018277994791666665, "learning_rate": 0.0001, "loss": 8.627, "loss/crossentropy": 2.2845258712768555, "loss/hidden": 3.3984375, "loss/jsd": 0.0, "loss/logits": 0.30091987550258636, "step": 2136 }, { "epoch": 0.133625, "grad_norm": 3.421875, "grad_norm_var": 0.06393941243489583, "learning_rate": 0.0001, "loss": 8.734, "loss/crossentropy": 2.3022284507751465, "loss/hidden": 3.4453125, "loss/jsd": 0.0, "loss/logits": 0.29516495764255524, "step": 2138 }, { "epoch": 0.13375, "grad_norm": 3.390625, "grad_norm_var": 0.06435445149739584, "learning_rate": 0.0001, "loss": 8.3216, "loss/crossentropy": 1.9607903957366943, "loss/hidden": 3.3828125, "loss/jsd": 0.0, "loss/logits": 0.2719266712665558, "step": 2140 }, { "epoch": 0.133875, "grad_norm": 3.703125, "grad_norm_var": 0.14335530598958332, "learning_rate": 0.0001, "loss": 8.524, "loss/crossentropy": 2.459964156150818, "loss/hidden": 3.6015625, "loss/jsd": 0.0, "loss/logits": 0.32108771800994873, "step": 2142 }, { "epoch": 0.134, "grad_norm": 3.53125, "grad_norm_var": 0.17431233723958334, "learning_rate": 0.0001, "loss": 8.7612, "loss/crossentropy": 2.313539743423462, "loss/hidden": 3.4921875, "loss/jsd": 0.0, "loss/logits": 0.320299968123436, "step": 2144 }, { "epoch": 0.134125, "grad_norm": 3.875, "grad_norm_var": 0.18583984375, "learning_rate": 0.0001, "loss": 8.4788, "loss/crossentropy": 2.3377647399902344, "loss/hidden": 3.46875, "loss/jsd": 0.0, "loss/logits": 0.2800298482179642, "step": 2146 }, { "epoch": 0.13425, "grad_norm": 3.265625, "grad_norm_var": 0.17565104166666667, "learning_rate": 0.0001, "loss": 8.4365, "loss/crossentropy": 2.123021960258484, "loss/hidden": 3.3125, "loss/jsd": 0.0, "loss/logits": 0.2664967030286789, "step": 2148 }, { "epoch": 0.134375, "grad_norm": 3.328125, "grad_norm_var": 0.16220296223958333, "learning_rate": 0.0001, "loss": 8.2453, "loss/crossentropy": 2.2075616121292114, "loss/hidden": 3.4140625, "loss/jsd": 0.0, "loss/logits": 0.28618696331977844, "step": 2150 }, { "epoch": 0.1345, "grad_norm": 5.125, "grad_norm_var": 0.29674072265625, "learning_rate": 0.0001, "loss": 8.4086, "loss/crossentropy": 1.887313187122345, "loss/hidden": 3.328125, "loss/jsd": 0.0, "loss/logits": 0.25394419580698013, "step": 2152 }, { "epoch": 0.134625, "grad_norm": 3.375, "grad_norm_var": 0.2922027587890625, "learning_rate": 0.0001, "loss": 8.5106, "loss/crossentropy": 2.190902292728424, "loss/hidden": 3.3203125, "loss/jsd": 0.0, "loss/logits": 0.2828090041875839, "step": 2154 }, { "epoch": 0.13475, "grad_norm": 3.3125, "grad_norm_var": 0.30569661458333336, "learning_rate": 0.0001, "loss": 8.1465, "loss/crossentropy": 2.044608175754547, "loss/hidden": 3.3671875, "loss/jsd": 0.0, "loss/logits": 0.26675868034362793, "step": 2156 }, { "epoch": 0.134875, "grad_norm": 3.21875, "grad_norm_var": 0.2681793212890625, "learning_rate": 0.0001, "loss": 8.2373, "loss/crossentropy": 2.2066221237182617, "loss/hidden": 3.375, "loss/jsd": 0.0, "loss/logits": 0.2483505755662918, "step": 2158 }, { "epoch": 0.135, "grad_norm": 5.25, "grad_norm_var": 0.45745442708333334, "learning_rate": 0.0001, "loss": 8.4411, "loss/crossentropy": 2.147895574569702, "loss/hidden": 3.34375, "loss/jsd": 0.0, "loss/logits": 0.2631945163011551, "step": 2160 }, { "epoch": 0.135125, "grad_norm": 3.015625, "grad_norm_var": 0.47237040201822916, "learning_rate": 0.0001, "loss": 8.6435, "loss/crossentropy": 2.4051777124404907, "loss/hidden": 3.484375, "loss/jsd": 0.0, "loss/logits": 0.30923953652381897, "step": 2162 }, { "epoch": 0.13525, "grad_norm": 3.109375, "grad_norm_var": 0.47711588541666666, "learning_rate": 0.0001, "loss": 8.7551, "loss/crossentropy": 2.5131943225860596, "loss/hidden": 3.4609375, "loss/jsd": 0.0, "loss/logits": 0.2815927714109421, "step": 2164 }, { "epoch": 0.135375, "grad_norm": 3.34375, "grad_norm_var": 0.47489827473958335, "learning_rate": 0.0001, "loss": 8.4668, "loss/crossentropy": 1.8161372542381287, "loss/hidden": 3.359375, "loss/jsd": 0.0, "loss/logits": 0.24684634804725647, "step": 2166 }, { "epoch": 0.1355, "grad_norm": 3.734375, "grad_norm_var": 0.37408447265625, "learning_rate": 0.0001, "loss": 8.83, "loss/crossentropy": 2.2895381450653076, "loss/hidden": 3.53125, "loss/jsd": 0.0, "loss/logits": 0.31556376814842224, "step": 2168 }, { "epoch": 0.135625, "grad_norm": 3.53125, "grad_norm_var": 0.39519856770833334, "learning_rate": 0.0001, "loss": 8.4855, "loss/crossentropy": 2.247248888015747, "loss/hidden": 3.6171875, "loss/jsd": 0.0, "loss/logits": 0.29209309816360474, "step": 2170 }, { "epoch": 0.13575, "grad_norm": 3.109375, "grad_norm_var": 0.3930409749348958, "learning_rate": 0.0001, "loss": 8.3079, "loss/crossentropy": 2.232522487640381, "loss/hidden": 3.5, "loss/jsd": 0.0, "loss/logits": 0.26617686450481415, "step": 2172 }, { "epoch": 0.135875, "grad_norm": 3.390625, "grad_norm_var": 0.40654195149739586, "learning_rate": 0.0001, "loss": 8.3587, "loss/crossentropy": 2.297611117362976, "loss/hidden": 3.4609375, "loss/jsd": 0.0, "loss/logits": 0.2740744650363922, "step": 2174 }, { "epoch": 0.136, "grad_norm": 3.875, "grad_norm_var": 0.34224853515625, "learning_rate": 0.0001, "loss": 8.4817, "loss/crossentropy": 2.3698227405548096, "loss/hidden": 3.4375, "loss/jsd": 0.0, "loss/logits": 0.294612318277359, "step": 2176 }, { "epoch": 0.136125, "grad_norm": 3.203125, "grad_norm_var": 0.32259012858072916, "learning_rate": 0.0001, "loss": 8.3781, "loss/crossentropy": 2.1921546459198, "loss/hidden": 3.375, "loss/jsd": 0.0, "loss/logits": 0.2621006965637207, "step": 2178 }, { "epoch": 0.13625, "grad_norm": 3.0625, "grad_norm_var": 0.32325846354166665, "learning_rate": 0.0001, "loss": 8.5063, "loss/crossentropy": 2.149673104286194, "loss/hidden": 3.4765625, "loss/jsd": 0.0, "loss/logits": 0.31072892248630524, "step": 2180 }, { "epoch": 0.136375, "grad_norm": 3.0625, "grad_norm_var": 0.3575846354166667, "learning_rate": 0.0001, "loss": 8.2498, "loss/crossentropy": 2.3115307092666626, "loss/hidden": 3.4296875, "loss/jsd": 0.0, "loss/logits": 0.25833451747894287, "step": 2182 }, { "epoch": 0.1365, "grad_norm": 3.3125, "grad_norm_var": 0.2910227457682292, "learning_rate": 0.0001, "loss": 8.7784, "loss/crossentropy": 2.2834960222244263, "loss/hidden": 3.4375, "loss/jsd": 0.0, "loss/logits": 0.2899049371480942, "step": 2184 }, { "epoch": 0.136625, "grad_norm": 3.203125, "grad_norm_var": 0.2539133707682292, "learning_rate": 0.0001, "loss": 8.5944, "loss/crossentropy": 2.504146695137024, "loss/hidden": 3.5625, "loss/jsd": 0.0, "loss/logits": 0.29903455078601837, "step": 2186 }, { "epoch": 0.13675, "grad_norm": 3.078125, "grad_norm_var": 0.2524322509765625, "learning_rate": 0.0001, "loss": 8.6814, "loss/crossentropy": 2.2595409154891968, "loss/hidden": 3.421875, "loss/jsd": 0.0, "loss/logits": 0.2784164845943451, "step": 2188 }, { "epoch": 0.136875, "grad_norm": 3.53125, "grad_norm_var": 0.24133707682291666, "learning_rate": 0.0001, "loss": 8.4609, "loss/crossentropy": 2.294034481048584, "loss/hidden": 3.3984375, "loss/jsd": 0.0, "loss/logits": 0.2981789708137512, "step": 2190 }, { "epoch": 0.137, "grad_norm": 3.625, "grad_norm_var": 0.046873982747395834, "learning_rate": 0.0001, "loss": 8.6629, "loss/crossentropy": 2.3820838928222656, "loss/hidden": 3.4609375, "loss/jsd": 0.0, "loss/logits": 0.3091062009334564, "step": 2192 }, { "epoch": 0.137125, "grad_norm": 4.0625, "grad_norm_var": 0.08557942708333334, "learning_rate": 0.0001, "loss": 8.3423, "loss/crossentropy": 1.8493825197219849, "loss/hidden": 3.390625, "loss/jsd": 0.0, "loss/logits": 0.2580011934041977, "step": 2194 }, { "epoch": 0.13725, "grad_norm": 3.390625, "grad_norm_var": 0.0845611572265625, "learning_rate": 0.0001, "loss": 8.5883, "loss/crossentropy": 2.4166128635406494, "loss/hidden": 3.5625, "loss/jsd": 0.0, "loss/logits": 0.2578553408384323, "step": 2196 }, { "epoch": 0.137375, "grad_norm": 3.421875, "grad_norm_var": 0.06991780598958333, "learning_rate": 0.0001, "loss": 8.5704, "loss/crossentropy": 2.3514604568481445, "loss/hidden": 3.484375, "loss/jsd": 0.0, "loss/logits": 0.28164079785346985, "step": 2198 }, { "epoch": 0.1375, "grad_norm": 3.328125, "grad_norm_var": 0.063623046875, "learning_rate": 0.0001, "loss": 8.4875, "loss/crossentropy": 2.224833130836487, "loss/hidden": 3.375, "loss/jsd": 0.0, "loss/logits": 0.2789185047149658, "step": 2200 }, { "epoch": 0.137625, "grad_norm": 3.21875, "grad_norm_var": 0.07244466145833334, "learning_rate": 0.0001, "loss": 8.3003, "loss/crossentropy": 2.208402156829834, "loss/hidden": 3.546875, "loss/jsd": 0.0, "loss/logits": 0.2867588549852371, "step": 2202 }, { "epoch": 0.13775, "grad_norm": 3.25, "grad_norm_var": 0.07492574055989583, "learning_rate": 0.0001, "loss": 8.4887, "loss/crossentropy": 1.9286993741989136, "loss/hidden": 3.34375, "loss/jsd": 0.0, "loss/logits": 0.25911755859851837, "step": 2204 }, { "epoch": 0.137875, "grad_norm": 4.25, "grad_norm_var": 0.11220296223958333, "learning_rate": 0.0001, "loss": 8.7322, "loss/crossentropy": 2.3510414361953735, "loss/hidden": 3.3984375, "loss/jsd": 0.0, "loss/logits": 0.29371631145477295, "step": 2206 }, { "epoch": 0.138, "grad_norm": 4.625, "grad_norm_var": 0.20168355305989583, "learning_rate": 0.0001, "loss": 8.1643, "loss/crossentropy": 2.2828208208084106, "loss/hidden": 3.328125, "loss/jsd": 0.0, "loss/logits": 0.2722548693418503, "step": 2208 }, { "epoch": 0.138125, "grad_norm": 3.453125, "grad_norm_var": 0.16903889973958333, "learning_rate": 0.0001, "loss": 8.7001, "loss/crossentropy": 2.4297702312469482, "loss/hidden": 3.5234375, "loss/jsd": 0.0, "loss/logits": 0.3309246301651001, "step": 2210 }, { "epoch": 0.13825, "grad_norm": 3.9375, "grad_norm_var": 0.17733968098958333, "learning_rate": 0.0001, "loss": 8.8431, "loss/crossentropy": 2.216072678565979, "loss/hidden": 3.640625, "loss/jsd": 0.0, "loss/logits": 0.315808430314064, "step": 2212 }, { "epoch": 0.138375, "grad_norm": 3.21875, "grad_norm_var": 0.19077860514322917, "learning_rate": 0.0001, "loss": 8.3663, "loss/crossentropy": 2.3218764066696167, "loss/hidden": 3.4921875, "loss/jsd": 0.0, "loss/logits": 0.2762540280818939, "step": 2214 }, { "epoch": 0.1385, "grad_norm": 3.125, "grad_norm_var": 0.2101470947265625, "learning_rate": 0.0001, "loss": 8.1269, "loss/crossentropy": 2.1518408060073853, "loss/hidden": 3.4921875, "loss/jsd": 0.0, "loss/logits": 0.2575703561306, "step": 2216 }, { "epoch": 0.138625, "grad_norm": 3.328125, "grad_norm_var": 0.19173177083333334, "learning_rate": 0.0001, "loss": 8.602, "loss/crossentropy": 2.468363881111145, "loss/hidden": 3.5546875, "loss/jsd": 0.0, "loss/logits": 0.27463802695274353, "step": 2218 }, { "epoch": 0.13875, "grad_norm": 3.4375, "grad_norm_var": 0.18999735514322916, "learning_rate": 0.0001, "loss": 8.6541, "loss/crossentropy": 2.109190046787262, "loss/hidden": 3.4921875, "loss/jsd": 0.0, "loss/logits": 0.2649577260017395, "step": 2220 }, { "epoch": 0.138875, "grad_norm": 2.84375, "grad_norm_var": 0.17636311848958333, "learning_rate": 0.0001, "loss": 8.2879, "loss/crossentropy": 2.2118396759033203, "loss/hidden": 3.3984375, "loss/jsd": 0.0, "loss/logits": 0.26701056957244873, "step": 2222 }, { "epoch": 0.139, "grad_norm": 3.421875, "grad_norm_var": 0.0676177978515625, "learning_rate": 0.0001, "loss": 8.6078, "loss/crossentropy": 2.1393160820007324, "loss/hidden": 3.40625, "loss/jsd": 0.0, "loss/logits": 0.2829648405313492, "step": 2224 }, { "epoch": 0.139125, "grad_norm": 3.109375, "grad_norm_var": 0.06685282389322916, "learning_rate": 0.0001, "loss": 8.3696, "loss/crossentropy": 2.331485629081726, "loss/hidden": 3.359375, "loss/jsd": 0.0, "loss/logits": 0.271269828081131, "step": 2226 }, { "epoch": 0.13925, "grad_norm": 3.484375, "grad_norm_var": 0.0275543212890625, "learning_rate": 0.0001, "loss": 8.4111, "loss/crossentropy": 2.121020555496216, "loss/hidden": 3.3125, "loss/jsd": 0.0, "loss/logits": 0.2616734802722931, "step": 2228 }, { "epoch": 0.139375, "grad_norm": 3.1875, "grad_norm_var": 0.028612263997395835, "learning_rate": 0.0001, "loss": 8.5096, "loss/crossentropy": 2.2679263949394226, "loss/hidden": 3.6171875, "loss/jsd": 0.0, "loss/logits": 0.3067668527364731, "step": 2230 }, { "epoch": 0.1395, "grad_norm": 3.015625, "grad_norm_var": 0.0309967041015625, "learning_rate": 0.0001, "loss": 8.4079, "loss/crossentropy": 2.4295634031295776, "loss/hidden": 3.390625, "loss/jsd": 0.0, "loss/logits": 0.2828514277935028, "step": 2232 }, { "epoch": 0.139625, "grad_norm": 3.21875, "grad_norm_var": 0.030052693684895833, "learning_rate": 0.0001, "loss": 8.2772, "loss/crossentropy": 2.0072978734970093, "loss/hidden": 3.3515625, "loss/jsd": 0.0, "loss/logits": 0.2873122990131378, "step": 2234 }, { "epoch": 0.13975, "grad_norm": 3.09375, "grad_norm_var": 0.0290191650390625, "learning_rate": 0.0001, "loss": 8.3857, "loss/crossentropy": 2.435922622680664, "loss/hidden": 3.40625, "loss/jsd": 0.0, "loss/logits": 0.2705515921115875, "step": 2236 }, { "epoch": 0.139875, "grad_norm": 3.328125, "grad_norm_var": 0.020490519205729165, "learning_rate": 0.0001, "loss": 8.2538, "loss/crossentropy": 2.279047131538391, "loss/hidden": 3.28125, "loss/jsd": 0.0, "loss/logits": 0.27282020449638367, "step": 2238 }, { "epoch": 0.14, "grad_norm": 3.359375, "grad_norm_var": 0.018651326497395832, "learning_rate": 0.0001, "loss": 8.4812, "loss/crossentropy": 2.0159464478492737, "loss/hidden": 3.3203125, "loss/jsd": 0.0, "loss/logits": 0.2737518176436424, "step": 2240 }, { "epoch": 0.140125, "grad_norm": 2.96875, "grad_norm_var": 0.022166951497395834, "learning_rate": 0.0001, "loss": 8.2228, "loss/crossentropy": 2.1809054017066956, "loss/hidden": 3.3515625, "loss/jsd": 0.0, "loss/logits": 0.24882421642541885, "step": 2242 }, { "epoch": 0.14025, "grad_norm": 3.1875, "grad_norm_var": 0.015804036458333334, "learning_rate": 0.0001, "loss": 8.6345, "loss/crossentropy": 2.165052652359009, "loss/hidden": 3.3359375, "loss/jsd": 0.0, "loss/logits": 0.2511095628142357, "step": 2244 }, { "epoch": 0.140375, "grad_norm": 3.765625, "grad_norm_var": 0.03730061848958333, "learning_rate": 0.0001, "loss": 8.6016, "loss/crossentropy": 2.254419445991516, "loss/hidden": 3.484375, "loss/jsd": 0.0, "loss/logits": 0.30638401210308075, "step": 2246 }, { "epoch": 0.1405, "grad_norm": 2.859375, "grad_norm_var": 0.04080403645833333, "learning_rate": 0.0001, "loss": 8.465, "loss/crossentropy": 2.5244404077529907, "loss/hidden": 3.453125, "loss/jsd": 0.0, "loss/logits": 0.2933734357357025, "step": 2248 }, { "epoch": 0.140625, "grad_norm": 3.515625, "grad_norm_var": 0.04674072265625, "learning_rate": 0.0001, "loss": 8.6814, "loss/crossentropy": 2.517358899116516, "loss/hidden": 3.3828125, "loss/jsd": 0.0, "loss/logits": 0.2738707587122917, "step": 2250 }, { "epoch": 0.14075, "grad_norm": 3.203125, "grad_norm_var": 0.04534098307291667, "learning_rate": 0.0001, "loss": 8.2856, "loss/crossentropy": 2.006367027759552, "loss/hidden": 3.515625, "loss/jsd": 0.0, "loss/logits": 0.2857169508934021, "step": 2252 }, { "epoch": 0.140875, "grad_norm": 3.296875, "grad_norm_var": 0.045775349934895834, "learning_rate": 0.0001, "loss": 8.6663, "loss/crossentropy": 2.3400460481643677, "loss/hidden": 3.375, "loss/jsd": 0.0, "loss/logits": 0.2911923974752426, "step": 2254 }, { "epoch": 0.141, "grad_norm": 3.359375, "grad_norm_var": 0.044657389322916664, "learning_rate": 0.0001, "loss": 8.4925, "loss/crossentropy": 2.3501226902008057, "loss/hidden": 3.46875, "loss/jsd": 0.0, "loss/logits": 0.29308466613292694, "step": 2256 }, { "epoch": 0.141125, "grad_norm": 3.453125, "grad_norm_var": 0.0412261962890625, "learning_rate": 0.0001, "loss": 8.3601, "loss/crossentropy": 2.314428687095642, "loss/hidden": 3.2890625, "loss/jsd": 0.0, "loss/logits": 0.2409065216779709, "step": 2258 }, { "epoch": 0.14125, "grad_norm": 2.84375, "grad_norm_var": 0.0558502197265625, "learning_rate": 0.0001, "loss": 8.3079, "loss/crossentropy": 2.311669945716858, "loss/hidden": 3.40625, "loss/jsd": 0.0, "loss/logits": 0.26933349668979645, "step": 2260 }, { "epoch": 0.141375, "grad_norm": 3.015625, "grad_norm_var": 0.039632161458333336, "learning_rate": 0.0001, "loss": 8.0227, "loss/crossentropy": 2.1360250115394592, "loss/hidden": 3.3515625, "loss/jsd": 0.0, "loss/logits": 0.24210172146558762, "step": 2262 }, { "epoch": 0.1415, "grad_norm": 3.140625, "grad_norm_var": 0.03661702473958333, "learning_rate": 0.0001, "loss": 8.075, "loss/crossentropy": 2.298375964164734, "loss/hidden": 3.3359375, "loss/jsd": 0.0, "loss/logits": 0.2606397569179535, "step": 2264 }, { "epoch": 0.141625, "grad_norm": 3.046875, "grad_norm_var": 0.029393513997395832, "learning_rate": 0.0001, "loss": 8.0933, "loss/crossentropy": 2.2362215518951416, "loss/hidden": 3.25, "loss/jsd": 0.0, "loss/logits": 0.2660336494445801, "step": 2266 }, { "epoch": 0.14175, "grad_norm": 3.5, "grad_norm_var": 0.038141886393229164, "learning_rate": 0.0001, "loss": 8.1783, "loss/crossentropy": 2.0173644423484802, "loss/hidden": 3.296875, "loss/jsd": 0.0, "loss/logits": 0.2474978342652321, "step": 2268 }, { "epoch": 0.141875, "grad_norm": 2.8125, "grad_norm_var": 0.04566650390625, "learning_rate": 0.0001, "loss": 7.8623, "loss/crossentropy": 2.1669762134552, "loss/hidden": 3.28125, "loss/jsd": 0.0, "loss/logits": 0.23262600600719452, "step": 2270 }, { "epoch": 0.142, "grad_norm": 3.203125, "grad_norm_var": 0.044661458333333334, "learning_rate": 0.0001, "loss": 8.1598, "loss/crossentropy": 2.128685235977173, "loss/hidden": 3.3046875, "loss/jsd": 0.0, "loss/logits": 0.25580504536628723, "step": 2272 }, { "epoch": 0.142125, "grad_norm": 3.34375, "grad_norm_var": 0.042235310872395834, "learning_rate": 0.0001, "loss": 8.6214, "loss/crossentropy": 1.9299900531768799, "loss/hidden": 3.4296875, "loss/jsd": 0.0, "loss/logits": 0.25190970301628113, "step": 2274 }, { "epoch": 0.14225, "grad_norm": 3.1875, "grad_norm_var": 0.03828837076822917, "learning_rate": 0.0001, "loss": 8.4324, "loss/crossentropy": 2.3441572189331055, "loss/hidden": 3.3515625, "loss/jsd": 0.0, "loss/logits": 0.2926934063434601, "step": 2276 }, { "epoch": 0.142375, "grad_norm": 5.25, "grad_norm_var": 0.3029581705729167, "learning_rate": 0.0001, "loss": 8.4633, "loss/crossentropy": 2.2406026124954224, "loss/hidden": 3.3984375, "loss/jsd": 0.0, "loss/logits": 0.2710404098033905, "step": 2278 }, { "epoch": 0.1425, "grad_norm": 3.28125, "grad_norm_var": 0.3035807291666667, "learning_rate": 0.0001, "loss": 8.5417, "loss/crossentropy": 2.1938705444335938, "loss/hidden": 3.421875, "loss/jsd": 0.0, "loss/logits": 0.28860265016555786, "step": 2280 }, { "epoch": 0.142625, "grad_norm": 2.96875, "grad_norm_var": 0.3068918863932292, "learning_rate": 0.0001, "loss": 8.3029, "loss/crossentropy": 2.1168991327285767, "loss/hidden": 3.546875, "loss/jsd": 0.0, "loss/logits": 0.30032673478126526, "step": 2282 }, { "epoch": 0.14275, "grad_norm": 3.21875, "grad_norm_var": 0.30548502604166666, "learning_rate": 0.0001, "loss": 8.4951, "loss/crossentropy": 2.35022509098053, "loss/hidden": 3.40625, "loss/jsd": 0.0, "loss/logits": 0.2850249111652374, "step": 2284 }, { "epoch": 0.142875, "grad_norm": 3.375, "grad_norm_var": 0.3026652018229167, "learning_rate": 0.0001, "loss": 8.6239, "loss/crossentropy": 2.199749708175659, "loss/hidden": 3.3203125, "loss/jsd": 0.0, "loss/logits": 0.2940863221883774, "step": 2286 }, { "epoch": 0.143, "grad_norm": 3.25, "grad_norm_var": 0.29019775390625, "learning_rate": 0.0001, "loss": 8.5977, "loss/crossentropy": 2.1447832584381104, "loss/hidden": 3.3984375, "loss/jsd": 0.0, "loss/logits": 0.2969307154417038, "step": 2288 }, { "epoch": 0.143125, "grad_norm": 4.1875, "grad_norm_var": 0.33202718098958334, "learning_rate": 0.0001, "loss": 8.6013, "loss/crossentropy": 2.9763671159744263, "loss/hidden": 3.484375, "loss/jsd": 0.0, "loss/logits": 0.28709542751312256, "step": 2290 }, { "epoch": 0.14325, "grad_norm": 3.375, "grad_norm_var": 0.32502848307291665, "learning_rate": 0.0001, "loss": 8.3914, "loss/crossentropy": 2.138368010520935, "loss/hidden": 3.3671875, "loss/jsd": 0.0, "loss/logits": 0.27638138830661774, "step": 2292 }, { "epoch": 0.143375, "grad_norm": 3.140625, "grad_norm_var": 0.10810139973958334, "learning_rate": 0.0001, "loss": 8.3082, "loss/crossentropy": 2.300687074661255, "loss/hidden": 3.28125, "loss/jsd": 0.0, "loss/logits": 0.257497176527977, "step": 2294 }, { "epoch": 0.1435, "grad_norm": 3.1875, "grad_norm_var": 0.11966145833333333, "learning_rate": 0.0001, "loss": 8.3761, "loss/crossentropy": 2.491214394569397, "loss/hidden": 3.4453125, "loss/jsd": 0.0, "loss/logits": 0.29092299938201904, "step": 2296 }, { "epoch": 0.143625, "grad_norm": 3.359375, "grad_norm_var": 0.1099273681640625, "learning_rate": 0.0001, "loss": 8.2483, "loss/crossentropy": 2.437252640724182, "loss/hidden": 3.4921875, "loss/jsd": 0.0, "loss/logits": 0.3005269169807434, "step": 2298 }, { "epoch": 0.14375, "grad_norm": 2.984375, "grad_norm_var": 0.12758687337239583, "learning_rate": 0.0001, "loss": 8.246, "loss/crossentropy": 2.3133788108825684, "loss/hidden": 3.3515625, "loss/jsd": 0.0, "loss/logits": 0.2604673504829407, "step": 2300 }, { "epoch": 0.143875, "grad_norm": 3.203125, "grad_norm_var": 0.10556233723958333, "learning_rate": 0.0001, "loss": 8.5592, "loss/crossentropy": 2.2636178731918335, "loss/hidden": 3.5625, "loss/jsd": 0.0, "loss/logits": 0.29124458134174347, "step": 2302 }, { "epoch": 0.144, "grad_norm": 4.1875, "grad_norm_var": 0.15493062337239583, "learning_rate": 0.0001, "loss": 8.4898, "loss/crossentropy": 2.585403323173523, "loss/hidden": 3.3359375, "loss/jsd": 0.0, "loss/logits": 0.2634608894586563, "step": 2304 }, { "epoch": 0.144125, "grad_norm": 3.3125, "grad_norm_var": 0.08720296223958333, "learning_rate": 0.0001, "loss": 8.5296, "loss/crossentropy": 2.421893000602722, "loss/hidden": 3.359375, "loss/jsd": 0.0, "loss/logits": 0.27335743606090546, "step": 2306 }, { "epoch": 0.14425, "grad_norm": 2.96875, "grad_norm_var": 0.08804931640625, "learning_rate": 0.0001, "loss": 8.4971, "loss/crossentropy": 2.302649974822998, "loss/hidden": 3.4375, "loss/jsd": 0.0, "loss/logits": 0.2991143465042114, "step": 2308 }, { "epoch": 0.144375, "grad_norm": 3.03125, "grad_norm_var": 0.08941650390625, "learning_rate": 0.0001, "loss": 8.3063, "loss/crossentropy": 2.2079267501831055, "loss/hidden": 3.5078125, "loss/jsd": 0.0, "loss/logits": 0.27418044209480286, "step": 2310 }, { "epoch": 0.1445, "grad_norm": 3.140625, "grad_norm_var": 0.08531494140625, "learning_rate": 0.0001, "loss": 8.1061, "loss/crossentropy": 2.2134305238723755, "loss/hidden": 3.28125, "loss/jsd": 0.0, "loss/logits": 0.2882683426141739, "step": 2312 }, { "epoch": 0.144625, "grad_norm": 3.140625, "grad_norm_var": 0.08690999348958334, "learning_rate": 0.0001, "loss": 8.4894, "loss/crossentropy": 2.1350191831588745, "loss/hidden": 3.3828125, "loss/jsd": 0.0, "loss/logits": 0.3250105082988739, "step": 2314 }, { "epoch": 0.14475, "grad_norm": 3.28125, "grad_norm_var": 0.08273111979166667, "learning_rate": 0.0001, "loss": 8.2803, "loss/crossentropy": 2.162496566772461, "loss/hidden": 3.4296875, "loss/jsd": 0.0, "loss/logits": 0.2692483738064766, "step": 2316 }, { "epoch": 0.144875, "grad_norm": 3.546875, "grad_norm_var": 0.08546549479166667, "learning_rate": 0.0001, "loss": 8.2229, "loss/crossentropy": 2.1046829223632812, "loss/hidden": 3.328125, "loss/jsd": 0.0, "loss/logits": 0.23328028619289398, "step": 2318 }, { "epoch": 0.145, "grad_norm": 4.25, "grad_norm_var": 0.0941314697265625, "learning_rate": 0.0001, "loss": 8.6969, "loss/crossentropy": 2.144542932510376, "loss/hidden": 3.46875, "loss/jsd": 0.0, "loss/logits": 0.27399805188179016, "step": 2320 }, { "epoch": 0.145125, "grad_norm": 3.09375, "grad_norm_var": 0.09934895833333333, "learning_rate": 0.0001, "loss": 8.2252, "loss/crossentropy": 2.0601924061775208, "loss/hidden": 3.4765625, "loss/jsd": 0.0, "loss/logits": 0.25523945689201355, "step": 2322 }, { "epoch": 0.14525, "grad_norm": 3.0, "grad_norm_var": 0.09812723795572917, "learning_rate": 0.0001, "loss": 8.2399, "loss/crossentropy": 1.8771136403083801, "loss/hidden": 3.3828125, "loss/jsd": 0.0, "loss/logits": 0.249388687312603, "step": 2324 }, { "epoch": 0.145375, "grad_norm": 3.34375, "grad_norm_var": 0.09247639973958334, "learning_rate": 0.0001, "loss": 8.3756, "loss/crossentropy": 2.1348198652267456, "loss/hidden": 3.375, "loss/jsd": 0.0, "loss/logits": 0.27005739510059357, "step": 2326 }, { "epoch": 0.1455, "grad_norm": 4.34375, "grad_norm_var": 0.44801025390625, "learning_rate": 0.0001, "loss": 8.6195, "loss/crossentropy": 2.4480077028274536, "loss/hidden": 3.5859375, "loss/jsd": 0.0, "loss/logits": 0.2902005910873413, "step": 2328 }, { "epoch": 0.145625, "grad_norm": 3.375, "grad_norm_var": 0.44322509765625, "learning_rate": 0.0001, "loss": 8.4845, "loss/crossentropy": 2.32487416267395, "loss/hidden": 3.34375, "loss/jsd": 0.0, "loss/logits": 0.31037984788417816, "step": 2330 }, { "epoch": 0.14575, "grad_norm": 3.28125, "grad_norm_var": 0.4233632405598958, "learning_rate": 0.0001, "loss": 8.3503, "loss/crossentropy": 2.242250680923462, "loss/hidden": 3.4453125, "loss/jsd": 0.0, "loss/logits": 0.29078349471092224, "step": 2332 }, { "epoch": 0.145875, "grad_norm": 3.21875, "grad_norm_var": 0.431884765625, "learning_rate": 0.0001, "loss": 8.5279, "loss/crossentropy": 2.4411145448684692, "loss/hidden": 3.296875, "loss/jsd": 0.0, "loss/logits": 0.28765274584293365, "step": 2334 }, { "epoch": 0.146, "grad_norm": 3.328125, "grad_norm_var": 0.40498046875, "learning_rate": 0.0001, "loss": 8.4669, "loss/crossentropy": 2.0890385508537292, "loss/hidden": 3.4375, "loss/jsd": 0.0, "loss/logits": 0.2982271760702133, "step": 2336 }, { "epoch": 0.146125, "grad_norm": 3.453125, "grad_norm_var": 0.39547526041666664, "learning_rate": 0.0001, "loss": 8.5609, "loss/crossentropy": 2.087196469306946, "loss/hidden": 3.4609375, "loss/jsd": 0.0, "loss/logits": 0.2540636509656906, "step": 2338 }, { "epoch": 0.14625, "grad_norm": 3.46875, "grad_norm_var": 0.3778605143229167, "learning_rate": 0.0001, "loss": 8.3474, "loss/crossentropy": 2.168229579925537, "loss/hidden": 3.46875, "loss/jsd": 0.0, "loss/logits": 0.2915681451559067, "step": 2340 }, { "epoch": 0.146375, "grad_norm": 3.328125, "grad_norm_var": 0.37956441243489586, "learning_rate": 0.0001, "loss": 8.4842, "loss/crossentropy": 2.0825737714767456, "loss/hidden": 3.5625, "loss/jsd": 0.0, "loss/logits": 0.32262521982192993, "step": 2342 }, { "epoch": 0.1465, "grad_norm": 3.046875, "grad_norm_var": 0.02125244140625, "learning_rate": 0.0001, "loss": 8.4115, "loss/crossentropy": 2.0815482139587402, "loss/hidden": 3.3828125, "loss/jsd": 0.0, "loss/logits": 0.26708245277404785, "step": 2344 }, { "epoch": 0.146625, "grad_norm": 3.546875, "grad_norm_var": 0.7017567952473959, "learning_rate": 0.0001, "loss": 8.8283, "loss/crossentropy": 2.1069406867027283, "loss/hidden": 3.5546875, "loss/jsd": 0.0, "loss/logits": 0.40591706335544586, "step": 2346 }, { "epoch": 0.14675, "grad_norm": 4.46875, "grad_norm_var": 0.7509073893229167, "learning_rate": 0.0001, "loss": 8.3247, "loss/crossentropy": 2.1671907901763916, "loss/hidden": 3.515625, "loss/jsd": 0.0, "loss/logits": 0.2940974235534668, "step": 2348 }, { "epoch": 0.146875, "grad_norm": 3.15625, "grad_norm_var": 0.7454905192057292, "learning_rate": 0.0001, "loss": 8.3786, "loss/crossentropy": 2.1944754123687744, "loss/hidden": 3.3984375, "loss/jsd": 0.0, "loss/logits": 0.30334649980068207, "step": 2350 }, { "epoch": 0.147, "grad_norm": 3.484375, "grad_norm_var": 0.7332834879557292, "learning_rate": 0.0001, "loss": 8.4307, "loss/crossentropy": 2.402153253555298, "loss/hidden": 3.4453125, "loss/jsd": 0.0, "loss/logits": 0.27832649648189545, "step": 2352 }, { "epoch": 0.147125, "grad_norm": 3.0625, "grad_norm_var": 0.7498524983723959, "learning_rate": 0.0001, "loss": 8.4974, "loss/crossentropy": 2.1214135885238647, "loss/hidden": 3.359375, "loss/jsd": 0.0, "loss/logits": 0.273033007979393, "step": 2354 }, { "epoch": 0.14725, "grad_norm": 2.921875, "grad_norm_var": 0.7869049072265625, "learning_rate": 0.0001, "loss": 8.3771, "loss/crossentropy": 2.073447108268738, "loss/hidden": 3.34375, "loss/jsd": 0.0, "loss/logits": 0.24891676753759384, "step": 2356 }, { "epoch": 0.147375, "grad_norm": 3.375, "grad_norm_var": 0.7880167643229167, "learning_rate": 0.0001, "loss": 8.3275, "loss/crossentropy": 2.4699442386627197, "loss/hidden": 3.3515625, "loss/jsd": 0.0, "loss/logits": 0.261794850230217, "step": 2358 }, { "epoch": 0.1475, "grad_norm": 2.96875, "grad_norm_var": 0.8022450764973958, "learning_rate": 0.0001, "loss": 8.0362, "loss/crossentropy": 2.443465232849121, "loss/hidden": 3.421875, "loss/jsd": 0.0, "loss/logits": 0.26376432180404663, "step": 2360 }, { "epoch": 0.147625, "grad_norm": 3.09375, "grad_norm_var": 0.18026936848958333, "learning_rate": 0.0001, "loss": 8.5294, "loss/crossentropy": 2.499496102333069, "loss/hidden": 3.40625, "loss/jsd": 0.0, "loss/logits": 0.24655090272426605, "step": 2362 }, { "epoch": 0.14775, "grad_norm": 3.34375, "grad_norm_var": 0.09453023274739583, "learning_rate": 0.0001, "loss": 8.4863, "loss/crossentropy": 2.4230228662490845, "loss/hidden": 3.4375, "loss/jsd": 0.0, "loss/logits": 0.3454309552907944, "step": 2364 }, { "epoch": 0.147875, "grad_norm": 3.046875, "grad_norm_var": 0.08621419270833333, "learning_rate": 0.0001, "loss": 8.1537, "loss/crossentropy": 2.1860731840133667, "loss/hidden": 3.28125, "loss/jsd": 0.0, "loss/logits": 0.2569929361343384, "step": 2366 }, { "epoch": 0.148, "grad_norm": 3.21875, "grad_norm_var": 0.08123372395833334, "learning_rate": 0.0001, "loss": 8.4979, "loss/crossentropy": 2.297507882118225, "loss/hidden": 3.328125, "loss/jsd": 0.0, "loss/logits": 0.2582452595233917, "step": 2368 }, { "epoch": 0.148125, "grad_norm": 3.125, "grad_norm_var": 0.07655843098958333, "learning_rate": 0.0001, "loss": 8.0855, "loss/crossentropy": 2.257783532142639, "loss/hidden": 3.34375, "loss/jsd": 0.0, "loss/logits": 0.24964100122451782, "step": 2370 }, { "epoch": 0.14825, "grad_norm": 3.078125, "grad_norm_var": 0.07252604166666667, "learning_rate": 0.0001, "loss": 8.1806, "loss/crossentropy": 2.336972236633301, "loss/hidden": 3.375, "loss/jsd": 0.0, "loss/logits": 0.25840216875076294, "step": 2372 }, { "epoch": 0.148375, "grad_norm": 3.21875, "grad_norm_var": 0.020929972330729168, "learning_rate": 0.0001, "loss": 8.2393, "loss/crossentropy": 2.225935459136963, "loss/hidden": 3.328125, "loss/jsd": 0.0, "loss/logits": 0.26378747820854187, "step": 2374 }, { "epoch": 0.1485, "grad_norm": 3.046875, "grad_norm_var": 0.0204010009765625, "learning_rate": 0.0001, "loss": 8.3894, "loss/crossentropy": 2.2249433994293213, "loss/hidden": 3.390625, "loss/jsd": 0.0, "loss/logits": 0.2794678509235382, "step": 2376 }, { "epoch": 0.148625, "grad_norm": 2.90625, "grad_norm_var": 0.026285807291666668, "learning_rate": 0.0001, "loss": 8.1643, "loss/crossentropy": 2.337942361831665, "loss/hidden": 3.3828125, "loss/jsd": 0.0, "loss/logits": 0.2834671437740326, "step": 2378 }, { "epoch": 0.14875, "grad_norm": 2.984375, "grad_norm_var": 0.0201568603515625, "learning_rate": 0.0001, "loss": 8.2151, "loss/crossentropy": 2.287383556365967, "loss/hidden": 3.3515625, "loss/jsd": 0.0, "loss/logits": 0.26868072152137756, "step": 2380 }, { "epoch": 0.148875, "grad_norm": 3.34375, "grad_norm_var": 0.0216796875, "learning_rate": 0.0001, "loss": 8.3641, "loss/crossentropy": 2.3883267641067505, "loss/hidden": 3.40625, "loss/jsd": 0.0, "loss/logits": 0.2867826446890831, "step": 2382 }, { "epoch": 0.149, "grad_norm": 4.125, "grad_norm_var": 0.0908355712890625, "learning_rate": 0.0001, "loss": 8.4252, "loss/crossentropy": 2.524049162864685, "loss/hidden": 3.421875, "loss/jsd": 0.0, "loss/logits": 0.2781444936990738, "step": 2384 }, { "epoch": 0.149125, "grad_norm": 3.0, "grad_norm_var": 0.09146219889322917, "learning_rate": 0.0001, "loss": 8.2951, "loss/crossentropy": 2.110072374343872, "loss/hidden": 3.421875, "loss/jsd": 0.0, "loss/logits": 0.23993606865406036, "step": 2386 }, { "epoch": 0.14925, "grad_norm": 3.46875, "grad_norm_var": 0.099462890625, "learning_rate": 0.0001, "loss": 8.3375, "loss/crossentropy": 2.0417853593826294, "loss/hidden": 3.3046875, "loss/jsd": 0.0, "loss/logits": 0.28202496469020844, "step": 2388 }, { "epoch": 0.149375, "grad_norm": 3.21875, "grad_norm_var": 0.125244140625, "learning_rate": 0.0001, "loss": 8.083, "loss/crossentropy": 2.0900679230690002, "loss/hidden": 3.296875, "loss/jsd": 0.0, "loss/logits": 0.2541571632027626, "step": 2390 }, { "epoch": 0.1495, "grad_norm": 3.046875, "grad_norm_var": 0.12751363118489584, "learning_rate": 0.0001, "loss": 8.3203, "loss/crossentropy": 2.0518333315849304, "loss/hidden": 3.34375, "loss/jsd": 0.0, "loss/logits": 0.25456421822309494, "step": 2392 }, { "epoch": 0.149625, "grad_norm": 3.515625, "grad_norm_var": 0.129931640625, "learning_rate": 0.0001, "loss": 8.4005, "loss/crossentropy": 2.3813477754592896, "loss/hidden": 3.359375, "loss/jsd": 0.0, "loss/logits": 0.2721667140722275, "step": 2394 }, { "epoch": 0.14975, "grad_norm": 3.125, "grad_norm_var": 0.12007548014322916, "learning_rate": 0.0001, "loss": 8.2645, "loss/crossentropy": 2.23868465423584, "loss/hidden": 3.46875, "loss/jsd": 0.0, "loss/logits": 0.3163903057575226, "step": 2396 }, { "epoch": 0.149875, "grad_norm": 3.421875, "grad_norm_var": 0.12371317545572917, "learning_rate": 0.0001, "loss": 8.33, "loss/crossentropy": 2.0757837891578674, "loss/hidden": 3.2578125, "loss/jsd": 0.0, "loss/logits": 0.24396058917045593, "step": 2398 }, { "epoch": 0.15, "grad_norm": 2.953125, "grad_norm_var": 0.06952718098958334, "learning_rate": 0.0001, "loss": 8.375, "loss/crossentropy": 2.37544047832489, "loss/hidden": 3.28125, "loss/jsd": 0.0, "loss/logits": 0.26771050691604614, "step": 2400 }, { "epoch": 0.150125, "grad_norm": 4.75, "grad_norm_var": 0.20263264973958334, "learning_rate": 0.0001, "loss": 8.3487, "loss/crossentropy": 2.3700286149978638, "loss/hidden": 3.390625, "loss/jsd": 0.0, "loss/logits": 0.2626129686832428, "step": 2402 }, { "epoch": 0.15025, "grad_norm": 2.921875, "grad_norm_var": 0.20690104166666667, "learning_rate": 0.0001, "loss": 8.2555, "loss/crossentropy": 2.392973303794861, "loss/hidden": 3.4375, "loss/jsd": 0.0, "loss/logits": 0.27510133385658264, "step": 2404 }, { "epoch": 0.150375, "grad_norm": 3.609375, "grad_norm_var": 0.28280843098958336, "learning_rate": 0.0001, "loss": 8.3727, "loss/crossentropy": 2.410975694656372, "loss/hidden": 3.40625, "loss/jsd": 0.0, "loss/logits": 0.26646968722343445, "step": 2406 }, { "epoch": 0.1505, "grad_norm": 3.59375, "grad_norm_var": 0.2575480143229167, "learning_rate": 0.0001, "loss": 8.6951, "loss/crossentropy": 2.320314884185791, "loss/hidden": 3.4453125, "loss/jsd": 0.0, "loss/logits": 0.3037981539964676, "step": 2408 }, { "epoch": 0.150625, "grad_norm": 3.375, "grad_norm_var": 0.25926005045572914, "learning_rate": 0.0001, "loss": 8.3943, "loss/crossentropy": 2.296256422996521, "loss/hidden": 3.484375, "loss/jsd": 0.0, "loss/logits": 0.29959771037101746, "step": 2410 }, { "epoch": 0.15075, "grad_norm": 3.140625, "grad_norm_var": 0.2638336181640625, "learning_rate": 0.0001, "loss": 8.5054, "loss/crossentropy": 2.2240231037139893, "loss/hidden": 3.4375, "loss/jsd": 0.0, "loss/logits": 0.28252455592155457, "step": 2412 }, { "epoch": 0.150875, "grad_norm": 3.75, "grad_norm_var": 0.26970113118489586, "learning_rate": 0.0001, "loss": 8.4365, "loss/crossentropy": 2.202714502811432, "loss/hidden": 3.4140625, "loss/jsd": 0.0, "loss/logits": 0.29341815412044525, "step": 2414 }, { "epoch": 0.151, "grad_norm": 3.265625, "grad_norm_var": 0.25357666015625, "learning_rate": 0.0001, "loss": 8.7353, "loss/crossentropy": 2.5229181051254272, "loss/hidden": 3.4296875, "loss/jsd": 0.0, "loss/logits": 0.3003344312310219, "step": 2416 }, { "epoch": 0.151125, "grad_norm": 3.1875, "grad_norm_var": 0.3305328369140625, "learning_rate": 0.0001, "loss": 8.446, "loss/crossentropy": 2.3139584064483643, "loss/hidden": 3.2265625, "loss/jsd": 0.0, "loss/logits": 0.2626221179962158, "step": 2418 }, { "epoch": 0.15125, "grad_norm": 3.453125, "grad_norm_var": 0.31324869791666665, "learning_rate": 0.0001, "loss": 8.7435, "loss/crossentropy": 2.2493679523468018, "loss/hidden": 3.359375, "loss/jsd": 0.0, "loss/logits": 0.2802734673023224, "step": 2420 }, { "epoch": 0.151375, "grad_norm": 3.21875, "grad_norm_var": 0.2513834635416667, "learning_rate": 0.0001, "loss": 8.5734, "loss/crossentropy": 2.0787190198898315, "loss/hidden": 3.3515625, "loss/jsd": 0.0, "loss/logits": 0.2961963415145874, "step": 2422 }, { "epoch": 0.1515, "grad_norm": 3.109375, "grad_norm_var": 0.26109619140625, "learning_rate": 0.0001, "loss": 8.5869, "loss/crossentropy": 2.2477983236312866, "loss/hidden": 3.375, "loss/jsd": 0.0, "loss/logits": 0.25737468898296356, "step": 2424 }, { "epoch": 0.151625, "grad_norm": 4.15625, "grad_norm_var": 0.2965159098307292, "learning_rate": 0.0001, "loss": 8.5124, "loss/crossentropy": 1.983967125415802, "loss/hidden": 3.359375, "loss/jsd": 0.0, "loss/logits": 0.28184664249420166, "step": 2426 }, { "epoch": 0.15175, "grad_norm": 3.328125, "grad_norm_var": 0.31382548014322914, "learning_rate": 0.0001, "loss": 8.5955, "loss/crossentropy": 2.3329524993896484, "loss/hidden": 3.59375, "loss/jsd": 0.0, "loss/logits": 0.3499256670475006, "step": 2428 }, { "epoch": 0.151875, "grad_norm": 3.375, "grad_norm_var": 0.31070048014322915, "learning_rate": 0.0001, "loss": 8.7101, "loss/crossentropy": 2.1876468658447266, "loss/hidden": 3.390625, "loss/jsd": 0.0, "loss/logits": 0.29871222376823425, "step": 2430 }, { "epoch": 0.152, "grad_norm": 4.78125, "grad_norm_var": 0.3991170247395833, "learning_rate": 0.0001, "loss": 8.4217, "loss/crossentropy": 2.476602792739868, "loss/hidden": 3.5703125, "loss/jsd": 0.0, "loss/logits": 0.29820646345615387, "step": 2432 }, { "epoch": 0.152125, "grad_norm": 3.46875, "grad_norm_var": 3.324348958333333, "learning_rate": 0.0001, "loss": 8.4021, "loss/crossentropy": 2.1025315523147583, "loss/hidden": 3.328125, "loss/jsd": 0.0, "loss/logits": 0.2696252912282944, "step": 2434 }, { "epoch": 0.15225, "grad_norm": 3.578125, "grad_norm_var": 3.2503733317057293, "learning_rate": 0.0001, "loss": 8.2267, "loss/crossentropy": 2.1821004152297974, "loss/hidden": 3.4453125, "loss/jsd": 0.0, "loss/logits": 0.23813840001821518, "step": 2436 }, { "epoch": 0.152375, "grad_norm": 3.140625, "grad_norm_var": 3.300633748372396, "learning_rate": 0.0001, "loss": 8.2952, "loss/crossentropy": 2.321933627128601, "loss/hidden": 3.3203125, "loss/jsd": 0.0, "loss/logits": 0.2608601897954941, "step": 2438 }, { "epoch": 0.1525, "grad_norm": 3.0, "grad_norm_var": 3.3150390625, "learning_rate": 0.0001, "loss": 8.4226, "loss/crossentropy": 2.56186044216156, "loss/hidden": 3.375, "loss/jsd": 0.0, "loss/logits": 0.2735903561115265, "step": 2440 }, { "epoch": 0.152625, "grad_norm": 2.96875, "grad_norm_var": 3.441259765625, "learning_rate": 0.0001, "loss": 8.3983, "loss/crossentropy": 1.961831271648407, "loss/hidden": 3.4765625, "loss/jsd": 0.0, "loss/logits": 0.276444748044014, "step": 2442 }, { "epoch": 0.15275, "grad_norm": 4.25, "grad_norm_var": 3.4633778889973956, "learning_rate": 0.0001, "loss": 8.4155, "loss/crossentropy": 2.319730758666992, "loss/hidden": 3.421875, "loss/jsd": 0.0, "loss/logits": 0.30145491659641266, "step": 2444 }, { "epoch": 0.152875, "grad_norm": 3.4375, "grad_norm_var": 3.4742828369140626, "learning_rate": 0.0001, "loss": 8.4731, "loss/crossentropy": 2.079390525817871, "loss/hidden": 3.34375, "loss/jsd": 0.0, "loss/logits": 0.25613100826740265, "step": 2446 }, { "epoch": 0.153, "grad_norm": 6.84375, "grad_norm_var": 3.949030558268229, "learning_rate": 0.0001, "loss": 8.787, "loss/crossentropy": 2.721150040626526, "loss/hidden": 3.421875, "loss/jsd": 0.0, "loss/logits": 0.30540551245212555, "step": 2448 }, { "epoch": 0.153125, "grad_norm": 3.078125, "grad_norm_var": 0.8867828369140625, "learning_rate": 0.0001, "loss": 8.28, "loss/crossentropy": 2.1300487518310547, "loss/hidden": 3.375, "loss/jsd": 0.0, "loss/logits": 0.29494112730026245, "step": 2450 }, { "epoch": 0.15325, "grad_norm": 3.703125, "grad_norm_var": 1.2272135416666667, "learning_rate": 0.0001, "loss": 8.489, "loss/crossentropy": 2.0942248106002808, "loss/hidden": 3.484375, "loss/jsd": 0.0, "loss/logits": 0.2816726565361023, "step": 2452 }, { "epoch": 0.153375, "grad_norm": 3.28125, "grad_norm_var": 1.2226959228515626, "learning_rate": 0.0001, "loss": 8.2282, "loss/crossentropy": 2.200441837310791, "loss/hidden": 3.5390625, "loss/jsd": 0.0, "loss/logits": 0.2815055698156357, "step": 2454 }, { "epoch": 0.1535, "grad_norm": 3.140625, "grad_norm_var": 1.2158355712890625, "learning_rate": 0.0001, "loss": 8.2848, "loss/crossentropy": 1.8995721936225891, "loss/hidden": 3.453125, "loss/jsd": 0.0, "loss/logits": 0.26202765107154846, "step": 2456 }, { "epoch": 0.153625, "grad_norm": 3.046875, "grad_norm_var": 1.1757120768229166, "learning_rate": 0.0001, "loss": 8.3382, "loss/crossentropy": 2.301843047142029, "loss/hidden": 3.3203125, "loss/jsd": 0.0, "loss/logits": 0.243467278778553, "step": 2458 }, { "epoch": 0.15375, "grad_norm": 3.046875, "grad_norm_var": 1.1745930989583333, "learning_rate": 0.0001, "loss": 8.4128, "loss/crossentropy": 2.4178149700164795, "loss/hidden": 3.3203125, "loss/jsd": 0.0, "loss/logits": 0.26342087984085083, "step": 2460 }, { "epoch": 0.153875, "grad_norm": 3.09375, "grad_norm_var": 1.1955963134765626, "learning_rate": 0.0001, "loss": 8.3898, "loss/crossentropy": 2.4432934522628784, "loss/hidden": 3.4140625, "loss/jsd": 0.0, "loss/logits": 0.27150075137615204, "step": 2462 }, { "epoch": 0.154, "grad_norm": 3.1875, "grad_norm_var": 0.4769816080729167, "learning_rate": 0.0001, "loss": 8.3308, "loss/crossentropy": 2.268512010574341, "loss/hidden": 3.3828125, "loss/jsd": 0.0, "loss/logits": 0.264302060008049, "step": 2464 }, { "epoch": 0.154125, "grad_norm": 3.6875, "grad_norm_var": 0.47346598307291665, "learning_rate": 0.0001, "loss": 8.2828, "loss/crossentropy": 2.004506468772888, "loss/hidden": 3.2734375, "loss/jsd": 0.0, "loss/logits": 0.24829556047916412, "step": 2466 }, { "epoch": 0.15425, "grad_norm": 3.390625, "grad_norm_var": 0.0311676025390625, "learning_rate": 0.0001, "loss": 8.5268, "loss/crossentropy": 2.5557998418807983, "loss/hidden": 3.3984375, "loss/jsd": 0.0, "loss/logits": 0.27453961968421936, "step": 2468 }, { "epoch": 0.154375, "grad_norm": 4.40625, "grad_norm_var": 0.11099344889322917, "learning_rate": 0.0001, "loss": 8.5754, "loss/crossentropy": 2.509130835533142, "loss/hidden": 3.546875, "loss/jsd": 0.0, "loss/logits": 0.31154434382915497, "step": 2470 }, { "epoch": 0.1545, "grad_norm": 3.453125, "grad_norm_var": 0.10640869140625, "learning_rate": 0.0001, "loss": 8.5946, "loss/crossentropy": 2.2127550840377808, "loss/hidden": 3.3515625, "loss/jsd": 0.0, "loss/logits": 0.2670055031776428, "step": 2472 }, { "epoch": 0.154625, "grad_norm": 3.40625, "grad_norm_var": 0.0988677978515625, "learning_rate": 0.0001, "loss": 8.6314, "loss/crossentropy": 2.355626106262207, "loss/hidden": 3.4296875, "loss/jsd": 0.0, "loss/logits": 0.29068681597709656, "step": 2474 }, { "epoch": 0.15475, "grad_norm": 3.578125, "grad_norm_var": 0.33668619791666665, "learning_rate": 0.0001, "loss": 8.6396, "loss/crossentropy": 2.313707947731018, "loss/hidden": 3.671875, "loss/jsd": 0.0, "loss/logits": 0.3355162888765335, "step": 2476 }, { "epoch": 0.154875, "grad_norm": 3.03125, "grad_norm_var": 0.3482747395833333, "learning_rate": 0.0001, "loss": 8.2465, "loss/crossentropy": 2.44563364982605, "loss/hidden": 3.3984375, "loss/jsd": 0.0, "loss/logits": 0.2863249182701111, "step": 2478 }, { "epoch": 0.155, "grad_norm": 3.109375, "grad_norm_var": 0.35400390625, "learning_rate": 0.0001, "loss": 8.2663, "loss/crossentropy": 2.1582404375076294, "loss/hidden": 3.28125, "loss/jsd": 0.0, "loss/logits": 0.2472543865442276, "step": 2480 }, { "epoch": 0.155125, "grad_norm": 3.015625, "grad_norm_var": 0.38386942545572916, "learning_rate": 0.0001, "loss": 8.4017, "loss/crossentropy": 2.1482057571411133, "loss/hidden": 3.328125, "loss/jsd": 0.0, "loss/logits": 0.280459925532341, "step": 2482 }, { "epoch": 0.15525, "grad_norm": 2.890625, "grad_norm_var": 0.403466796875, "learning_rate": 0.0001, "loss": 8.1928, "loss/crossentropy": 1.9350282549858093, "loss/hidden": 3.359375, "loss/jsd": 0.0, "loss/logits": 0.25202827900648117, "step": 2484 }, { "epoch": 0.155375, "grad_norm": 3.09375, "grad_norm_var": 0.3456939697265625, "learning_rate": 0.0001, "loss": 8.3613, "loss/crossentropy": 2.187419593334198, "loss/hidden": 3.359375, "loss/jsd": 0.0, "loss/logits": 0.2424585223197937, "step": 2486 }, { "epoch": 0.1555, "grad_norm": 3.15625, "grad_norm_var": 0.3592193603515625, "learning_rate": 0.0001, "loss": 8.5382, "loss/crossentropy": 2.0961918234825134, "loss/hidden": 3.4296875, "loss/jsd": 0.0, "loss/logits": 0.29027004539966583, "step": 2488 }, { "epoch": 0.155625, "grad_norm": 3.046875, "grad_norm_var": 0.3711334228515625, "learning_rate": 0.0001, "loss": 8.5189, "loss/crossentropy": 2.2345021963119507, "loss/hidden": 3.3515625, "loss/jsd": 0.0, "loss/logits": 0.2752293795347214, "step": 2490 }, { "epoch": 0.15575, "grad_norm": 3.4375, "grad_norm_var": 0.0588287353515625, "learning_rate": 0.0001, "loss": 8.4591, "loss/crossentropy": 2.0653876066207886, "loss/hidden": 3.4609375, "loss/jsd": 0.0, "loss/logits": 0.2703683376312256, "step": 2492 }, { "epoch": 0.155875, "grad_norm": 3.3125, "grad_norm_var": 0.07330729166666666, "learning_rate": 0.0001, "loss": 8.4717, "loss/crossentropy": 2.3052172660827637, "loss/hidden": 3.34375, "loss/jsd": 0.0, "loss/logits": 0.27236457914114, "step": 2494 }, { "epoch": 0.156, "grad_norm": 3.03125, "grad_norm_var": 0.08242899576822917, "learning_rate": 0.0001, "loss": 8.2526, "loss/crossentropy": 2.190616488456726, "loss/hidden": 3.25, "loss/jsd": 0.0, "loss/logits": 0.24784858524799347, "step": 2496 }, { "epoch": 0.156125, "grad_norm": 3.046875, "grad_norm_var": 0.07935282389322916, "learning_rate": 0.0001, "loss": 7.9527, "loss/crossentropy": 2.1316753029823303, "loss/hidden": 3.40625, "loss/jsd": 0.0, "loss/logits": 0.2527832090854645, "step": 2498 }, { "epoch": 0.15625, "grad_norm": 3.296875, "grad_norm_var": 0.0719390869140625, "learning_rate": 0.0001, "loss": 8.3477, "loss/crossentropy": 2.1822999119758606, "loss/hidden": 3.40625, "loss/jsd": 0.0, "loss/logits": 0.274670273065567, "step": 2500 }, { "epoch": 0.156375, "grad_norm": 3.875, "grad_norm_var": 0.09487202962239584, "learning_rate": 0.0001, "loss": 8.0511, "loss/crossentropy": 2.2012258768081665, "loss/hidden": 3.3203125, "loss/jsd": 0.0, "loss/logits": 0.2448551505804062, "step": 2502 }, { "epoch": 0.1565, "grad_norm": 3.484375, "grad_norm_var": 0.0845611572265625, "learning_rate": 0.0001, "loss": 8.4234, "loss/crossentropy": 2.2416664361953735, "loss/hidden": 3.421875, "loss/jsd": 0.0, "loss/logits": 0.2832803875207901, "step": 2504 }, { "epoch": 0.156625, "grad_norm": 3.125, "grad_norm_var": 0.07864176432291667, "learning_rate": 0.0001, "loss": 8.2685, "loss/crossentropy": 2.215089440345764, "loss/hidden": 3.265625, "loss/jsd": 0.0, "loss/logits": 0.25054821372032166, "step": 2506 }, { "epoch": 0.15675, "grad_norm": 4.40625, "grad_norm_var": 0.156982421875, "learning_rate": 0.0001, "loss": 8.3436, "loss/crossentropy": 1.9407853484153748, "loss/hidden": 3.6171875, "loss/jsd": 0.0, "loss/logits": 0.27322302013635635, "step": 2508 }, { "epoch": 0.156875, "grad_norm": 4.1875, "grad_norm_var": 2.074690755208333, "learning_rate": 0.0001, "loss": 8.9861, "loss/crossentropy": 2.4156020879745483, "loss/hidden": 3.34375, "loss/jsd": 0.0, "loss/logits": 0.2951352000236511, "step": 2510 }, { "epoch": 0.157, "grad_norm": 3.296875, "grad_norm_var": 2.0118398030598956, "learning_rate": 0.0001, "loss": 8.8208, "loss/crossentropy": 2.762333035469055, "loss/hidden": 3.5234375, "loss/jsd": 0.0, "loss/logits": 0.31056326627731323, "step": 2512 }, { "epoch": 0.157125, "grad_norm": 3.328125, "grad_norm_var": 1.9754557291666666, "learning_rate": 0.0001, "loss": 8.3697, "loss/crossentropy": 2.2353484630584717, "loss/hidden": 3.5078125, "loss/jsd": 0.0, "loss/logits": 0.27381396293640137, "step": 2514 }, { "epoch": 0.15725, "grad_norm": 3.140625, "grad_norm_var": 1.9652994791666667, "learning_rate": 0.0001, "loss": 8.5062, "loss/crossentropy": 2.054705858230591, "loss/hidden": 3.359375, "loss/jsd": 0.0, "loss/logits": 0.2728557288646698, "step": 2516 }, { "epoch": 0.157375, "grad_norm": 3.46875, "grad_norm_var": 1.967186482747396, "learning_rate": 0.0001, "loss": 8.4999, "loss/crossentropy": 2.3050085306167603, "loss/hidden": 3.5, "loss/jsd": 0.0, "loss/logits": 0.2859141230583191, "step": 2518 }, { "epoch": 0.1575, "grad_norm": 3.453125, "grad_norm_var": 1.9455078125, "learning_rate": 0.0001, "loss": 8.4744, "loss/crossentropy": 2.2714444398880005, "loss/hidden": 3.375, "loss/jsd": 0.0, "loss/logits": 0.2727499306201935, "step": 2520 }, { "epoch": 0.157625, "grad_norm": 3.0, "grad_norm_var": 1.943626912434896, "learning_rate": 0.0001, "loss": 8.4486, "loss/crossentropy": 2.1054731607437134, "loss/hidden": 3.34375, "loss/jsd": 0.0, "loss/logits": 0.26717132329940796, "step": 2522 }, { "epoch": 0.15775, "grad_norm": 3.046875, "grad_norm_var": 1.9481404622395833, "learning_rate": 0.0001, "loss": 8.2042, "loss/crossentropy": 2.4207626581192017, "loss/hidden": 3.3125, "loss/jsd": 0.0, "loss/logits": 0.2683243751525879, "step": 2524 }, { "epoch": 0.157875, "grad_norm": 3.375, "grad_norm_var": 0.08631184895833334, "learning_rate": 0.0001, "loss": 8.6577, "loss/crossentropy": 2.31955349445343, "loss/hidden": 3.3671875, "loss/jsd": 0.0, "loss/logits": 0.2918277531862259, "step": 2526 }, { "epoch": 0.158, "grad_norm": 3.078125, "grad_norm_var": 0.06933492024739583, "learning_rate": 0.0001, "loss": 7.9392, "loss/crossentropy": 2.2313653230667114, "loss/hidden": 3.2421875, "loss/jsd": 0.0, "loss/logits": 0.25323386490345, "step": 2528 }, { "epoch": 0.158125, "grad_norm": 2.90625, "grad_norm_var": 0.07825520833333334, "learning_rate": 0.0001, "loss": 8.09, "loss/crossentropy": 2.2791095972061157, "loss/hidden": 3.3125, "loss/jsd": 0.0, "loss/logits": 0.2563246637582779, "step": 2530 }, { "epoch": 0.15825, "grad_norm": 3.421875, "grad_norm_var": 0.08039957682291667, "learning_rate": 0.0001, "loss": 8.1278, "loss/crossentropy": 1.9691064953804016, "loss/hidden": 3.2734375, "loss/jsd": 0.0, "loss/logits": 0.27056171745061874, "step": 2532 }, { "epoch": 0.158375, "grad_norm": 3.171875, "grad_norm_var": 0.0800445556640625, "learning_rate": 0.0001, "loss": 8.2542, "loss/crossentropy": 2.3728338479995728, "loss/hidden": 3.265625, "loss/jsd": 0.0, "loss/logits": 0.26400597393512726, "step": 2534 }, { "epoch": 0.1585, "grad_norm": 3.1875, "grad_norm_var": 0.07693684895833333, "learning_rate": 0.0001, "loss": 8.0841, "loss/crossentropy": 2.503694772720337, "loss/hidden": 3.4453125, "loss/jsd": 0.0, "loss/logits": 0.24167175590991974, "step": 2536 }, { "epoch": 0.158625, "grad_norm": 3.265625, "grad_norm_var": 0.07219645182291666, "learning_rate": 0.0001, "loss": 8.2577, "loss/crossentropy": 2.1647502183914185, "loss/hidden": 3.296875, "loss/jsd": 0.0, "loss/logits": 0.2541022449731827, "step": 2538 }, { "epoch": 0.15875, "grad_norm": 3.15625, "grad_norm_var": 0.07073160807291666, "learning_rate": 0.0001, "loss": 8.3393, "loss/crossentropy": 2.0768316984176636, "loss/hidden": 3.359375, "loss/jsd": 0.0, "loss/logits": 0.28541265428066254, "step": 2540 }, { "epoch": 0.158875, "grad_norm": 2.9375, "grad_norm_var": 0.018895467122395832, "learning_rate": 0.0001, "loss": 8.0429, "loss/crossentropy": 1.9995542764663696, "loss/hidden": 3.2734375, "loss/jsd": 0.0, "loss/logits": 0.2272045910358429, "step": 2542 }, { "epoch": 0.159, "grad_norm": 3.171875, "grad_norm_var": 0.017903645833333332, "learning_rate": 0.0001, "loss": 8.4371, "loss/crossentropy": 2.2702198028564453, "loss/hidden": 3.4921875, "loss/jsd": 0.0, "loss/logits": 0.2836647480726242, "step": 2544 }, { "epoch": 0.159125, "grad_norm": 3.359375, "grad_norm_var": 0.016576131184895832, "learning_rate": 0.0001, "loss": 8.3525, "loss/crossentropy": 2.5021432638168335, "loss/hidden": 3.3671875, "loss/jsd": 0.0, "loss/logits": 0.29133598506450653, "step": 2546 }, { "epoch": 0.15925, "grad_norm": 2.875, "grad_norm_var": 0.016974894205729167, "learning_rate": 0.0001, "loss": 8.0069, "loss/crossentropy": 2.066003441810608, "loss/hidden": 3.2890625, "loss/jsd": 0.0, "loss/logits": 0.25314583629369736, "step": 2548 }, { "epoch": 0.159375, "grad_norm": 3.59375, "grad_norm_var": 0.031184895833333334, "learning_rate": 0.0001, "loss": 8.3917, "loss/crossentropy": 2.4096893072128296, "loss/hidden": 3.484375, "loss/jsd": 0.0, "loss/logits": 0.31454116106033325, "step": 2550 }, { "epoch": 0.1595, "grad_norm": 3.25, "grad_norm_var": 0.06728108723958333, "learning_rate": 0.0001, "loss": 8.1042, "loss/crossentropy": 2.391171097755432, "loss/hidden": 3.28125, "loss/jsd": 0.0, "loss/logits": 0.26810529828071594, "step": 2552 }, { "epoch": 0.159625, "grad_norm": 3.359375, "grad_norm_var": 0.0706939697265625, "learning_rate": 0.0001, "loss": 8.3264, "loss/crossentropy": 2.0131086707115173, "loss/hidden": 3.46875, "loss/jsd": 0.0, "loss/logits": 0.26509952545166016, "step": 2554 }, { "epoch": 0.15975, "grad_norm": 3.03125, "grad_norm_var": 0.07307535807291667, "learning_rate": 0.0001, "loss": 8.0544, "loss/crossentropy": 2.1483495235443115, "loss/hidden": 3.3125, "loss/jsd": 0.0, "loss/logits": 0.24957841634750366, "step": 2556 }, { "epoch": 0.159875, "grad_norm": 3.03125, "grad_norm_var": 0.06913655598958333, "learning_rate": 0.0001, "loss": 8.2527, "loss/crossentropy": 2.2075390815734863, "loss/hidden": 3.2421875, "loss/jsd": 0.0, "loss/logits": 0.2469852864742279, "step": 2558 }, { "epoch": 0.16, "grad_norm": 4.5, "grad_norm_var": 1.8503977457682292, "learning_rate": 0.0001, "loss": 8.5761, "loss/crossentropy": 2.4395029544830322, "loss/hidden": 3.5234375, "loss/jsd": 0.0, "loss/logits": 0.3529563844203949, "step": 2560 }, { "epoch": 0.160125, "grad_norm": 3.09375, "grad_norm_var": 1.855842081705729, "learning_rate": 0.0001, "loss": 8.5514, "loss/crossentropy": 2.6097675561904907, "loss/hidden": 3.3359375, "loss/jsd": 0.0, "loss/logits": 0.28696151077747345, "step": 2562 }, { "epoch": 0.16025, "grad_norm": 3.125, "grad_norm_var": 1.84644775390625, "learning_rate": 0.0001, "loss": 8.2892, "loss/crossentropy": 2.155561089515686, "loss/hidden": 3.265625, "loss/jsd": 0.0, "loss/logits": 0.2686978802084923, "step": 2564 }, { "epoch": 0.160375, "grad_norm": 3.671875, "grad_norm_var": 1.81519775390625, "learning_rate": 0.0001, "loss": 8.7252, "loss/crossentropy": 2.568781018257141, "loss/hidden": 3.421875, "loss/jsd": 0.0, "loss/logits": 0.3059556484222412, "step": 2566 }, { "epoch": 0.1605, "grad_norm": 3.09375, "grad_norm_var": 1.8237620035807292, "learning_rate": 0.0001, "loss": 8.1512, "loss/crossentropy": 2.066196024417877, "loss/hidden": 3.421875, "loss/jsd": 0.0, "loss/logits": 0.23930901288986206, "step": 2568 }, { "epoch": 0.160625, "grad_norm": 3.171875, "grad_norm_var": 1.842724609375, "learning_rate": 0.0001, "loss": 8.1435, "loss/crossentropy": 2.490206003189087, "loss/hidden": 3.3984375, "loss/jsd": 0.0, "loss/logits": 0.27601249516010284, "step": 2570 }, { "epoch": 0.16075, "grad_norm": 2.828125, "grad_norm_var": 1.8820790608723958, "learning_rate": 0.0001, "loss": 8.0316, "loss/crossentropy": 2.1523091793060303, "loss/hidden": 3.3046875, "loss/jsd": 0.0, "loss/logits": 0.23483121395111084, "step": 2572 }, { "epoch": 0.160875, "grad_norm": 3.015625, "grad_norm_var": 1.879596964518229, "learning_rate": 0.0001, "loss": 8.4708, "loss/crossentropy": 2.6995315551757812, "loss/hidden": 3.3359375, "loss/jsd": 0.0, "loss/logits": 0.2621884196996689, "step": 2574 }, { "epoch": 0.161, "grad_norm": 3.265625, "grad_norm_var": 0.06481831868489583, "learning_rate": 0.0001, "loss": 8.33, "loss/crossentropy": 2.4481579065322876, "loss/hidden": 3.421875, "loss/jsd": 0.0, "loss/logits": 0.2710493430495262, "step": 2576 }, { "epoch": 0.161125, "grad_norm": 3.234375, "grad_norm_var": 0.058080037434895836, "learning_rate": 0.0001, "loss": 8.4503, "loss/crossentropy": 2.4761546850204468, "loss/hidden": 3.3359375, "loss/jsd": 0.0, "loss/logits": 0.268977627158165, "step": 2578 }, { "epoch": 0.16125, "grad_norm": 2.9375, "grad_norm_var": 0.0637115478515625, "learning_rate": 0.0001, "loss": 8.2223, "loss/crossentropy": 2.2538411617279053, "loss/hidden": 3.4453125, "loss/jsd": 0.0, "loss/logits": 0.23571966588497162, "step": 2580 }, { "epoch": 0.161375, "grad_norm": 3.34375, "grad_norm_var": 0.0407135009765625, "learning_rate": 0.0001, "loss": 8.2121, "loss/crossentropy": 2.2190229892730713, "loss/hidden": 3.3125, "loss/jsd": 0.0, "loss/logits": 0.2735751271247864, "step": 2582 }, { "epoch": 0.1615, "grad_norm": 3.3125, "grad_norm_var": 0.04039713541666667, "learning_rate": 0.0001, "loss": 8.2551, "loss/crossentropy": 1.9724391102790833, "loss/hidden": 3.4921875, "loss/jsd": 0.0, "loss/logits": 0.25333186984062195, "step": 2584 }, { "epoch": 0.161625, "grad_norm": 3.890625, "grad_norm_var": 0.07361551920572916, "learning_rate": 0.0001, "loss": 8.806, "loss/crossentropy": 2.349689483642578, "loss/hidden": 3.40625, "loss/jsd": 0.0, "loss/logits": 0.28091633319854736, "step": 2586 }, { "epoch": 0.16175, "grad_norm": 3.203125, "grad_norm_var": 0.059601847330729166, "learning_rate": 0.0001, "loss": 8.2256, "loss/crossentropy": 2.1112502217292786, "loss/hidden": 3.421875, "loss/jsd": 0.0, "loss/logits": 0.27882492542266846, "step": 2588 }, { "epoch": 0.161875, "grad_norm": 3.265625, "grad_norm_var": 0.0524322509765625, "learning_rate": 0.0001, "loss": 8.2174, "loss/crossentropy": 2.113717198371887, "loss/hidden": 3.3125, "loss/jsd": 0.0, "loss/logits": 0.2782783955335617, "step": 2590 }, { "epoch": 0.162, "grad_norm": 10.3125, "grad_norm_var": 3.1836822509765623, "learning_rate": 0.0001, "loss": 8.331, "loss/crossentropy": 2.2447162866592407, "loss/hidden": 3.375, "loss/jsd": 0.0, "loss/logits": 0.24347630143165588, "step": 2592 }, { "epoch": 0.162125, "grad_norm": 3.546875, "grad_norm_var": 3.16724853515625, "learning_rate": 0.0001, "loss": 8.5865, "loss/crossentropy": 2.2667943239212036, "loss/hidden": 3.4375, "loss/jsd": 0.0, "loss/logits": 0.29205353558063507, "step": 2594 }, { "epoch": 0.16225, "grad_norm": 3.21875, "grad_norm_var": 3.12626953125, "learning_rate": 0.0001, "loss": 8.3279, "loss/crossentropy": 2.4091285467147827, "loss/hidden": 3.40625, "loss/jsd": 0.0, "loss/logits": 0.2889806926250458, "step": 2596 }, { "epoch": 0.162375, "grad_norm": 4.90625, "grad_norm_var": 8.08433837890625, "learning_rate": 0.0001, "loss": 8.9674, "loss/crossentropy": 2.1516201496124268, "loss/hidden": 3.390625, "loss/jsd": 0.0, "loss/logits": 0.27722157537937164, "step": 2598 }, { "epoch": 0.1625, "grad_norm": 2.875, "grad_norm_var": 8.181541951497396, "learning_rate": 0.0001, "loss": 8.2947, "loss/crossentropy": 2.1308915615081787, "loss/hidden": 3.4140625, "loss/jsd": 0.0, "loss/logits": 0.25969892740249634, "step": 2600 }, { "epoch": 0.162625, "grad_norm": 3.359375, "grad_norm_var": 8.16597900390625, "learning_rate": 0.0001, "loss": 8.3681, "loss/crossentropy": 2.5972925424575806, "loss/hidden": 3.5, "loss/jsd": 0.0, "loss/logits": 0.29456309974193573, "step": 2602 }, { "epoch": 0.16275, "grad_norm": 3.0625, "grad_norm_var": 8.18491923014323, "learning_rate": 0.0001, "loss": 8.5503, "loss/crossentropy": 1.8861650228500366, "loss/hidden": 3.3203125, "loss/jsd": 0.0, "loss/logits": 0.2846178114414215, "step": 2604 }, { "epoch": 0.162875, "grad_norm": 3.28125, "grad_norm_var": 8.164111328125, "learning_rate": 0.0001, "loss": 8.6373, "loss/crossentropy": 2.2498600482940674, "loss/hidden": 3.2890625, "loss/jsd": 0.0, "loss/logits": 0.2907450199127197, "step": 2606 }, { "epoch": 0.163, "grad_norm": 3.28125, "grad_norm_var": 5.6742909749348955, "learning_rate": 0.0001, "loss": 8.1804, "loss/crossentropy": 2.1924002170562744, "loss/hidden": 3.4453125, "loss/jsd": 0.0, "loss/logits": 0.2556323930621147, "step": 2608 }, { "epoch": 0.163125, "grad_norm": 3.0625, "grad_norm_var": 5.754759724934896, "learning_rate": 0.0001, "loss": 8.2493, "loss/crossentropy": 2.2142067551612854, "loss/hidden": 3.234375, "loss/jsd": 0.0, "loss/logits": 0.24400854110717773, "step": 2610 }, { "epoch": 0.16325, "grad_norm": 2.953125, "grad_norm_var": 5.752632649739583, "learning_rate": 0.0001, "loss": 8.1131, "loss/crossentropy": 2.1017364263534546, "loss/hidden": 3.28125, "loss/jsd": 0.0, "loss/logits": 0.25923022627830505, "step": 2612 }, { "epoch": 0.163375, "grad_norm": 3.078125, "grad_norm_var": 0.04047749837239583, "learning_rate": 0.0001, "loss": 7.9315, "loss/crossentropy": 2.040058970451355, "loss/hidden": 3.3125, "loss/jsd": 0.0, "loss/logits": 0.2411157488822937, "step": 2614 }, { "epoch": 0.1635, "grad_norm": 2.890625, "grad_norm_var": 0.04127197265625, "learning_rate": 0.0001, "loss": 8.0279, "loss/crossentropy": 2.126166343688965, "loss/hidden": 3.375, "loss/jsd": 0.0, "loss/logits": 0.2715003341436386, "step": 2616 }, { "epoch": 0.163625, "grad_norm": 3.390625, "grad_norm_var": 0.03964436848958333, "learning_rate": 0.0001, "loss": 8.5822, "loss/crossentropy": 2.304844081401825, "loss/hidden": 3.359375, "loss/jsd": 0.0, "loss/logits": 0.2995966821908951, "step": 2618 }, { "epoch": 0.16375, "grad_norm": 3.21875, "grad_norm_var": 0.03892313639322917, "learning_rate": 0.0001, "loss": 8.2578, "loss/crossentropy": 2.249672293663025, "loss/hidden": 3.2578125, "loss/jsd": 0.0, "loss/logits": 0.26482951641082764, "step": 2620 }, { "epoch": 0.163875, "grad_norm": 3.140625, "grad_norm_var": 0.035595703125, "learning_rate": 0.0001, "loss": 8.2427, "loss/crossentropy": 1.9495037198066711, "loss/hidden": 3.3359375, "loss/jsd": 0.0, "loss/logits": 0.245466411113739, "step": 2622 }, { "epoch": 0.164, "grad_norm": 3.078125, "grad_norm_var": 0.03412984212239583, "learning_rate": 0.0001, "loss": 8.7238, "loss/crossentropy": 2.1025261878967285, "loss/hidden": 3.3359375, "loss/jsd": 0.0, "loss/logits": 0.36893585324287415, "step": 2624 }, { "epoch": 0.164125, "grad_norm": 3.0, "grad_norm_var": 0.0374176025390625, "learning_rate": 0.0001, "loss": 7.9891, "loss/crossentropy": 2.166743278503418, "loss/hidden": 3.3125, "loss/jsd": 0.0, "loss/logits": 0.26486510038375854, "step": 2626 }, { "epoch": 0.16425, "grad_norm": 3.078125, "grad_norm_var": 0.020003255208333334, "learning_rate": 0.0001, "loss": 8.2195, "loss/crossentropy": 1.8708640336990356, "loss/hidden": 3.4296875, "loss/jsd": 0.0, "loss/logits": 0.29897913336753845, "step": 2628 }, { "epoch": 0.164375, "grad_norm": 3.046875, "grad_norm_var": 0.016682942708333332, "learning_rate": 0.0001, "loss": 8.1536, "loss/crossentropy": 2.2009392380714417, "loss/hidden": 3.328125, "loss/jsd": 0.0, "loss/logits": 0.24113795161247253, "step": 2630 }, { "epoch": 0.1645, "grad_norm": 3.09375, "grad_norm_var": 0.014964803059895834, "learning_rate": 0.0001, "loss": 7.9848, "loss/crossentropy": 2.154388189315796, "loss/hidden": 3.296875, "loss/jsd": 0.0, "loss/logits": 0.2846536338329315, "step": 2632 }, { "epoch": 0.164625, "grad_norm": 3.03125, "grad_norm_var": 0.0490631103515625, "learning_rate": 0.0001, "loss": 8.3996, "loss/crossentropy": 2.2005971670150757, "loss/hidden": 3.296875, "loss/jsd": 0.0, "loss/logits": 0.2885949909687042, "step": 2634 }, { "epoch": 0.16475, "grad_norm": 2.6875, "grad_norm_var": 0.05823160807291667, "learning_rate": 0.0001, "loss": 8.2362, "loss/crossentropy": 2.133545219898224, "loss/hidden": 3.3203125, "loss/jsd": 0.0, "loss/logits": 0.2844843938946724, "step": 2636 }, { "epoch": 0.164875, "grad_norm": 3.78125, "grad_norm_var": 0.08993733723958333, "learning_rate": 0.0001, "loss": 8.2067, "loss/crossentropy": 2.3494282960891724, "loss/hidden": 3.4296875, "loss/jsd": 0.0, "loss/logits": 0.2750708758831024, "step": 2638 }, { "epoch": 0.165, "grad_norm": 2.96875, "grad_norm_var": 0.122216796875, "learning_rate": 0.0001, "loss": 8.0289, "loss/crossentropy": 2.24998676776886, "loss/hidden": 3.46875, "loss/jsd": 0.0, "loss/logits": 0.2661600410938263, "step": 2640 }, { "epoch": 0.165125, "grad_norm": 2.921875, "grad_norm_var": 0.1233062744140625, "learning_rate": 0.0001, "loss": 8.0228, "loss/crossentropy": 2.5275110006332397, "loss/hidden": 3.375, "loss/jsd": 0.0, "loss/logits": 0.288016676902771, "step": 2642 }, { "epoch": 0.16525, "grad_norm": 8.375, "grad_norm_var": 1.8246378580729166, "learning_rate": 0.0001, "loss": 8.4217, "loss/crossentropy": 2.4159642457962036, "loss/hidden": 3.578125, "loss/jsd": 0.0, "loss/logits": 0.3448687344789505, "step": 2644 }, { "epoch": 0.165375, "grad_norm": 3.15625, "grad_norm_var": 1.8047515869140625, "learning_rate": 0.0001, "loss": 8.3375, "loss/crossentropy": 2.067953109741211, "loss/hidden": 3.296875, "loss/jsd": 0.0, "loss/logits": 0.23754960298538208, "step": 2646 }, { "epoch": 0.1655, "grad_norm": 3.109375, "grad_norm_var": 1.8573893229166667, "learning_rate": 0.0001, "loss": 8.1631, "loss/crossentropy": 2.0251219272613525, "loss/hidden": 3.28125, "loss/jsd": 0.0, "loss/logits": 0.24935762584209442, "step": 2648 }, { "epoch": 0.165625, "grad_norm": 3.203125, "grad_norm_var": 1.8426666259765625, "learning_rate": 0.0001, "loss": 8.2755, "loss/crossentropy": 2.2760668992996216, "loss/hidden": 3.328125, "loss/jsd": 0.0, "loss/logits": 0.26836584508419037, "step": 2650 }, { "epoch": 0.16575, "grad_norm": 3.3125, "grad_norm_var": 1.7913238525390625, "learning_rate": 0.0001, "loss": 8.2562, "loss/crossentropy": 2.1319936513900757, "loss/hidden": 3.3046875, "loss/jsd": 0.0, "loss/logits": 0.30381089448928833, "step": 2652 }, { "epoch": 0.165875, "grad_norm": 3.328125, "grad_norm_var": 1.7715321858723958, "learning_rate": 0.0001, "loss": 8.2505, "loss/crossentropy": 2.256405830383301, "loss/hidden": 3.6171875, "loss/jsd": 0.0, "loss/logits": 0.25285232812166214, "step": 2654 }, { "epoch": 0.166, "grad_norm": 3.234375, "grad_norm_var": 1.761310831705729, "learning_rate": 0.0001, "loss": 8.4639, "loss/crossentropy": 2.054237425327301, "loss/hidden": 3.4765625, "loss/jsd": 0.0, "loss/logits": 0.286573126912117, "step": 2656 }, { "epoch": 0.166125, "grad_norm": 2.859375, "grad_norm_var": 1.7568756103515626, "learning_rate": 0.0001, "loss": 8.1282, "loss/crossentropy": 2.1406211853027344, "loss/hidden": 3.53125, "loss/jsd": 0.0, "loss/logits": 0.24495816975831985, "step": 2658 }, { "epoch": 0.16625, "grad_norm": 3.328125, "grad_norm_var": 0.17919514973958334, "learning_rate": 0.0001, "loss": 8.1781, "loss/crossentropy": 2.0538607835769653, "loss/hidden": 3.328125, "loss/jsd": 0.0, "loss/logits": 0.26996807008981705, "step": 2660 }, { "epoch": 0.166375, "grad_norm": 3.15625, "grad_norm_var": 0.18124593098958333, "learning_rate": 0.0001, "loss": 8.4031, "loss/crossentropy": 1.9715936779975891, "loss/hidden": 3.4140625, "loss/jsd": 0.0, "loss/logits": 0.3107060194015503, "step": 2662 }, { "epoch": 0.1665, "grad_norm": 3.546875, "grad_norm_var": 0.06520182291666667, "learning_rate": 0.0001, "loss": 8.6967, "loss/crossentropy": 2.5435596704483032, "loss/hidden": 3.390625, "loss/jsd": 0.0, "loss/logits": 0.3240128457546234, "step": 2664 }, { "epoch": 0.166625, "grad_norm": 3.6875, "grad_norm_var": 0.06159566243489583, "learning_rate": 0.0001, "loss": 8.2558, "loss/crossentropy": 2.149868607521057, "loss/hidden": 3.5625, "loss/jsd": 0.0, "loss/logits": 0.29491525888442993, "step": 2666 }, { "epoch": 0.16675, "grad_norm": 2.765625, "grad_norm_var": 0.09220377604166667, "learning_rate": 0.0001, "loss": 8.1104, "loss/crossentropy": 2.378232002258301, "loss/hidden": 3.2734375, "loss/jsd": 0.0, "loss/logits": 0.23320211470127106, "step": 2668 }, { "epoch": 0.166875, "grad_norm": 4.96875, "grad_norm_var": 1.3787994384765625, "learning_rate": 0.0001, "loss": 8.275, "loss/crossentropy": 2.297814965248108, "loss/hidden": 3.40625, "loss/jsd": 0.0, "loss/logits": 0.27089808881282806, "step": 2670 }, { "epoch": 0.167, "grad_norm": 3.046875, "grad_norm_var": 1.4005696614583334, "learning_rate": 0.0001, "loss": 8.3007, "loss/crossentropy": 2.2771732807159424, "loss/hidden": 3.3671875, "loss/jsd": 0.0, "loss/logits": 0.26965174078941345, "step": 2672 }, { "epoch": 0.167125, "grad_norm": 3.4375, "grad_norm_var": 1.3486073811848958, "learning_rate": 0.0001, "loss": 8.241, "loss/crossentropy": 2.2909536361694336, "loss/hidden": 3.296875, "loss/jsd": 0.0, "loss/logits": 0.25784504413604736, "step": 2674 }, { "epoch": 0.16725, "grad_norm": 2.921875, "grad_norm_var": 1.38717041015625, "learning_rate": 0.0001, "loss": 8.2849, "loss/crossentropy": 2.165972590446472, "loss/hidden": 3.2421875, "loss/jsd": 0.0, "loss/logits": 0.32859523594379425, "step": 2676 }, { "epoch": 0.167375, "grad_norm": 3.21875, "grad_norm_var": 1.3915323893229166, "learning_rate": 0.0001, "loss": 8.258, "loss/crossentropy": 2.1997212171554565, "loss/hidden": 3.3046875, "loss/jsd": 0.0, "loss/logits": 0.28193938732147217, "step": 2678 }, { "epoch": 0.1675, "grad_norm": 3.296875, "grad_norm_var": 1.4145985921223958, "learning_rate": 0.0001, "loss": 8.2716, "loss/crossentropy": 1.921266257762909, "loss/hidden": 3.3984375, "loss/jsd": 0.0, "loss/logits": 0.2643412724137306, "step": 2680 }, { "epoch": 0.167625, "grad_norm": 3.546875, "grad_norm_var": 1.4224029541015626, "learning_rate": 0.0001, "loss": 8.2971, "loss/crossentropy": 2.499913454055786, "loss/hidden": 3.3828125, "loss/jsd": 0.0, "loss/logits": 0.26486673206090927, "step": 2682 }, { "epoch": 0.16775, "grad_norm": 3.21875, "grad_norm_var": 1.3593007405598958, "learning_rate": 0.0001, "loss": 8.5774, "loss/crossentropy": 2.460092782974243, "loss/hidden": 3.2890625, "loss/jsd": 0.0, "loss/logits": 0.27513206005096436, "step": 2684 }, { "epoch": 0.167875, "grad_norm": 3.859375, "grad_norm_var": 0.06664937337239583, "learning_rate": 0.0001, "loss": 8.1398, "loss/crossentropy": 2.0256370902061462, "loss/hidden": 3.28125, "loss/jsd": 0.0, "loss/logits": 0.25673844665288925, "step": 2686 }, { "epoch": 0.168, "grad_norm": 3.34375, "grad_norm_var": 0.06943257649739583, "learning_rate": 0.0001, "loss": 8.1891, "loss/crossentropy": 2.2106178998947144, "loss/hidden": 3.421875, "loss/jsd": 0.0, "loss/logits": 0.2606198415160179, "step": 2688 }, { "epoch": 0.168125, "grad_norm": 3.171875, "grad_norm_var": 0.15825093587239583, "learning_rate": 0.0001, "loss": 8.4468, "loss/crossentropy": 2.4705090522766113, "loss/hidden": 3.484375, "loss/jsd": 0.0, "loss/logits": 0.3326205313205719, "step": 2690 }, { "epoch": 0.16825, "grad_norm": 3.0, "grad_norm_var": 0.24257405598958334, "learning_rate": 0.0001, "loss": 8.3019, "loss/crossentropy": 2.1301801204681396, "loss/hidden": 3.2734375, "loss/jsd": 0.0, "loss/logits": 0.2574753388762474, "step": 2692 }, { "epoch": 0.168375, "grad_norm": 17.0, "grad_norm_var": 11.74107666015625, "learning_rate": 0.0001, "loss": 8.3817, "loss/crossentropy": 2.2080469131469727, "loss/hidden": 3.2890625, "loss/jsd": 0.0, "loss/logits": 0.25753118097782135, "step": 2694 }, { "epoch": 0.1685, "grad_norm": 3.0, "grad_norm_var": 11.712580362955729, "learning_rate": 0.0001, "loss": 8.1613, "loss/crossentropy": 2.4652985334396362, "loss/hidden": 3.34375, "loss/jsd": 0.0, "loss/logits": 0.26353147625923157, "step": 2696 }, { "epoch": 0.168625, "grad_norm": 3.140625, "grad_norm_var": 11.71861063639323, "learning_rate": 0.0001, "loss": 8.2969, "loss/crossentropy": 2.253800868988037, "loss/hidden": 3.3046875, "loss/jsd": 0.0, "loss/logits": 0.24178771674633026, "step": 2698 }, { "epoch": 0.16875, "grad_norm": 3.328125, "grad_norm_var": 11.747981770833333, "learning_rate": 0.0001, "loss": 8.2524, "loss/crossentropy": 2.0208754539489746, "loss/hidden": 3.2578125, "loss/jsd": 0.0, "loss/logits": 0.24065013974905014, "step": 2700 }, { "epoch": 0.168875, "grad_norm": 3.5625, "grad_norm_var": 11.730045572916667, "learning_rate": 0.0001, "loss": 8.3332, "loss/crossentropy": 2.3737761974334717, "loss/hidden": 3.34375, "loss/jsd": 0.0, "loss/logits": 0.26338720321655273, "step": 2702 }, { "epoch": 0.169, "grad_norm": 2.71875, "grad_norm_var": 11.904715983072917, "learning_rate": 0.0001, "loss": 8.1516, "loss/crossentropy": 2.3044910430908203, "loss/hidden": 3.296875, "loss/jsd": 0.0, "loss/logits": 0.27600476145744324, "step": 2704 }, { "epoch": 0.169125, "grad_norm": 3.21875, "grad_norm_var": 11.987723795572917, "learning_rate": 0.0001, "loss": 8.2153, "loss/crossentropy": 2.2738078832626343, "loss/hidden": 3.2578125, "loss/jsd": 0.0, "loss/logits": 0.24632424116134644, "step": 2706 }, { "epoch": 0.16925, "grad_norm": 2.96875, "grad_norm_var": 12.057062784830729, "learning_rate": 0.0001, "loss": 8.4078, "loss/crossentropy": 2.501603364944458, "loss/hidden": 3.2890625, "loss/jsd": 0.0, "loss/logits": 0.26396501064300537, "step": 2708 }, { "epoch": 0.169375, "grad_norm": 3.546875, "grad_norm_var": 0.06502176920572916, "learning_rate": 0.0001, "loss": 8.1556, "loss/crossentropy": 2.2160197496414185, "loss/hidden": 3.3125, "loss/jsd": 0.0, "loss/logits": 0.2618914991617203, "step": 2710 }, { "epoch": 0.1695, "grad_norm": 3.625, "grad_norm_var": 0.07457275390625, "learning_rate": 0.0001, "loss": 8.2313, "loss/crossentropy": 2.0781534910202026, "loss/hidden": 3.28125, "loss/jsd": 0.0, "loss/logits": 0.2430388554930687, "step": 2712 }, { "epoch": 0.169625, "grad_norm": 3.21875, "grad_norm_var": 0.07528889973958333, "learning_rate": 0.0001, "loss": 8.4484, "loss/crossentropy": 2.2283905744552612, "loss/hidden": 3.3828125, "loss/jsd": 0.0, "loss/logits": 0.2664916515350342, "step": 2714 }, { "epoch": 0.16975, "grad_norm": 3.1875, "grad_norm_var": 0.07080078125, "learning_rate": 0.0001, "loss": 8.1717, "loss/crossentropy": 2.0941238403320312, "loss/hidden": 3.4453125, "loss/jsd": 0.0, "loss/logits": 0.30617423355579376, "step": 2716 }, { "epoch": 0.169875, "grad_norm": 3.234375, "grad_norm_var": 0.0599273681640625, "learning_rate": 0.0001, "loss": 8.3934, "loss/crossentropy": 2.053987979888916, "loss/hidden": 3.3046875, "loss/jsd": 0.0, "loss/logits": 0.2649051547050476, "step": 2718 }, { "epoch": 0.17, "grad_norm": 3.84375, "grad_norm_var": 0.06912333170572917, "learning_rate": 0.0001, "loss": 8.4367, "loss/crossentropy": 2.317259907722473, "loss/hidden": 3.234375, "loss/jsd": 0.0, "loss/logits": 0.2616223096847534, "step": 2720 }, { "epoch": 0.170125, "grad_norm": 3.265625, "grad_norm_var": 0.06263020833333334, "learning_rate": 0.0001, "loss": 8.4487, "loss/crossentropy": 1.976955771446228, "loss/hidden": 3.3125, "loss/jsd": 0.0, "loss/logits": 0.25816483795642853, "step": 2722 }, { "epoch": 0.17025, "grad_norm": 7.875, "grad_norm_var": 1.3464192708333333, "learning_rate": 0.0001, "loss": 8.5821, "loss/crossentropy": 2.294445514678955, "loss/hidden": 3.3046875, "loss/jsd": 0.0, "loss/logits": 0.26846031844615936, "step": 2724 }, { "epoch": 0.170375, "grad_norm": 3.0625, "grad_norm_var": 1.3478098551432292, "learning_rate": 0.0001, "loss": 8.3747, "loss/crossentropy": 2.114520013332367, "loss/hidden": 3.34375, "loss/jsd": 0.0, "loss/logits": 0.26021555066108704, "step": 2726 }, { "epoch": 0.1705, "grad_norm": 3.375, "grad_norm_var": 1.3656483968098958, "learning_rate": 0.0001, "loss": 8.1332, "loss/crossentropy": 2.2090890407562256, "loss/hidden": 3.4453125, "loss/jsd": 0.0, "loss/logits": 0.26450634002685547, "step": 2728 }, { "epoch": 0.170625, "grad_norm": 3.125, "grad_norm_var": 1.387548828125, "learning_rate": 0.0001, "loss": 8.098, "loss/crossentropy": 2.155696392059326, "loss/hidden": 3.28125, "loss/jsd": 0.0, "loss/logits": 0.2571106255054474, "step": 2730 }, { "epoch": 0.17075, "grad_norm": 3.125, "grad_norm_var": 1.3824503580729166, "learning_rate": 0.0001, "loss": 8.3749, "loss/crossentropy": 2.2427806854248047, "loss/hidden": 3.296875, "loss/jsd": 0.0, "loss/logits": 0.2668910622596741, "step": 2732 }, { "epoch": 0.170875, "grad_norm": 3.203125, "grad_norm_var": 1.3876953125, "learning_rate": 0.0001, "loss": 8.3086, "loss/crossentropy": 2.316579580307007, "loss/hidden": 3.34375, "loss/jsd": 0.0, "loss/logits": 0.2466164082288742, "step": 2734 }, { "epoch": 0.171, "grad_norm": 3.078125, "grad_norm_var": 1.381004842122396, "learning_rate": 0.0001, "loss": 8.1225, "loss/crossentropy": 2.167383313179016, "loss/hidden": 3.3203125, "loss/jsd": 0.0, "loss/logits": 0.24197402596473694, "step": 2736 }, { "epoch": 0.171125, "grad_norm": 2.84375, "grad_norm_var": 1.4105214436848958, "learning_rate": 0.0001, "loss": 8.0321, "loss/crossentropy": 2.051010310649872, "loss/hidden": 3.265625, "loss/jsd": 0.0, "loss/logits": 0.230230450630188, "step": 2738 }, { "epoch": 0.17125, "grad_norm": 3.0625, "grad_norm_var": 0.033080037434895834, "learning_rate": 0.0001, "loss": 8.0972, "loss/crossentropy": 2.1876128911972046, "loss/hidden": 3.3671875, "loss/jsd": 0.0, "loss/logits": 0.28294453024864197, "step": 2740 }, { "epoch": 0.171375, "grad_norm": 3.140625, "grad_norm_var": 0.026488240559895834, "learning_rate": 0.0001, "loss": 8.3365, "loss/crossentropy": 2.6002990007400513, "loss/hidden": 3.2734375, "loss/jsd": 0.0, "loss/logits": 0.2489309012889862, "step": 2742 }, { "epoch": 0.1715, "grad_norm": 3.734375, "grad_norm_var": 0.04570210774739583, "learning_rate": 0.0001, "loss": 8.145, "loss/crossentropy": 2.4337987899780273, "loss/hidden": 3.6328125, "loss/jsd": 0.0, "loss/logits": 0.27483487129211426, "step": 2744 }, { "epoch": 0.171625, "grad_norm": 3.125, "grad_norm_var": 0.0450103759765625, "learning_rate": 0.0001, "loss": 8.3908, "loss/crossentropy": 2.3473914861679077, "loss/hidden": 3.265625, "loss/jsd": 0.0, "loss/logits": 0.27473725378513336, "step": 2746 }, { "epoch": 0.17175, "grad_norm": 4.75, "grad_norm_var": 0.20499674479166666, "learning_rate": 0.0001, "loss": 8.2717, "loss/crossentropy": 2.3052932024002075, "loss/hidden": 3.34375, "loss/jsd": 0.0, "loss/logits": 0.26262688636779785, "step": 2748 }, { "epoch": 0.171875, "grad_norm": 3.359375, "grad_norm_var": 0.20078125, "learning_rate": 0.0001, "loss": 8.3174, "loss/crossentropy": 2.3352149724960327, "loss/hidden": 3.28125, "loss/jsd": 0.0, "loss/logits": 0.273982509970665, "step": 2750 }, { "epoch": 0.172, "grad_norm": 3.25, "grad_norm_var": 0.19846598307291666, "learning_rate": 0.0001, "loss": 8.1853, "loss/crossentropy": 2.140386462211609, "loss/hidden": 3.3203125, "loss/jsd": 0.0, "loss/logits": 0.2775571495294571, "step": 2752 }, { "epoch": 0.172125, "grad_norm": 3.015625, "grad_norm_var": 0.19595947265625, "learning_rate": 0.0001, "loss": 8.2475, "loss/crossentropy": 2.207223057746887, "loss/hidden": 3.40625, "loss/jsd": 0.0, "loss/logits": 0.28297314047813416, "step": 2754 }, { "epoch": 0.17225, "grad_norm": 4.0, "grad_norm_var": 0.22538960774739583, "learning_rate": 0.0001, "loss": 8.202, "loss/crossentropy": 2.0487744212150574, "loss/hidden": 3.28125, "loss/jsd": 0.0, "loss/logits": 0.24457374960184097, "step": 2756 }, { "epoch": 0.172375, "grad_norm": 2.84375, "grad_norm_var": 0.22965087890625, "learning_rate": 0.0001, "loss": 8.2643, "loss/crossentropy": 2.0312034487724304, "loss/hidden": 3.359375, "loss/jsd": 0.0, "loss/logits": 0.2587245851755142, "step": 2758 }, { "epoch": 0.1725, "grad_norm": 3.25, "grad_norm_var": 0.22076822916666666, "learning_rate": 0.0001, "loss": 8.0611, "loss/crossentropy": 2.159967541694641, "loss/hidden": 3.203125, "loss/jsd": 0.0, "loss/logits": 0.2607487440109253, "step": 2760 }, { "epoch": 0.172625, "grad_norm": 4.84375, "grad_norm_var": 0.6103993733723958, "learning_rate": 0.0001, "loss": 8.5833, "loss/crossentropy": 2.3289551734924316, "loss/hidden": 3.3984375, "loss/jsd": 0.0, "loss/logits": 0.305271714925766, "step": 2762 }, { "epoch": 0.17275, "grad_norm": 3.28125, "grad_norm_var": 0.50279541015625, "learning_rate": 0.0001, "loss": 8.2619, "loss/crossentropy": 2.417571187019348, "loss/hidden": 3.34375, "loss/jsd": 0.0, "loss/logits": 0.30440282821655273, "step": 2764 }, { "epoch": 0.172875, "grad_norm": 3.234375, "grad_norm_var": 0.5197255452473958, "learning_rate": 0.0001, "loss": 8.2956, "loss/crossentropy": 2.2623164653778076, "loss/hidden": 3.3828125, "loss/jsd": 0.0, "loss/logits": 0.3535146266222, "step": 2766 }, { "epoch": 0.173, "grad_norm": 3.171875, "grad_norm_var": 0.5424224853515625, "learning_rate": 0.0001, "loss": 7.9312, "loss/crossentropy": 2.2652758359909058, "loss/hidden": 3.25, "loss/jsd": 0.0, "loss/logits": 0.24810383468866348, "step": 2768 }, { "epoch": 0.173125, "grad_norm": 3.25, "grad_norm_var": 0.51298828125, "learning_rate": 0.0001, "loss": 7.9873, "loss/crossentropy": 1.8157271146774292, "loss/hidden": 3.4765625, "loss/jsd": 0.0, "loss/logits": 0.2872622609138489, "step": 2770 }, { "epoch": 0.17325, "grad_norm": 3.21875, "grad_norm_var": 0.5064442952473959, "learning_rate": 0.0001, "loss": 8.1766, "loss/crossentropy": 2.367117762565613, "loss/hidden": 3.3125, "loss/jsd": 0.0, "loss/logits": 0.2569321021437645, "step": 2772 }, { "epoch": 0.173375, "grad_norm": 2.875, "grad_norm_var": 0.5081614176432292, "learning_rate": 0.0001, "loss": 8.0539, "loss/crossentropy": 2.406521797180176, "loss/hidden": 3.3125, "loss/jsd": 0.0, "loss/logits": 0.2661485821008682, "step": 2774 }, { "epoch": 0.1735, "grad_norm": 2.9375, "grad_norm_var": 0.5268870035807292, "learning_rate": 0.0001, "loss": 8.1589, "loss/crossentropy": 2.2054529190063477, "loss/hidden": 3.2734375, "loss/jsd": 0.0, "loss/logits": 0.2522689700126648, "step": 2776 }, { "epoch": 0.173625, "grad_norm": 3.28125, "grad_norm_var": 0.047835286458333334, "learning_rate": 0.0001, "loss": 8.224, "loss/crossentropy": 2.1260339617729187, "loss/hidden": 3.328125, "loss/jsd": 0.0, "loss/logits": 0.2733009606599808, "step": 2778 }, { "epoch": 0.17375, "grad_norm": 3.0, "grad_norm_var": 0.04888916015625, "learning_rate": 0.0001, "loss": 8.4169, "loss/crossentropy": 2.4982125759124756, "loss/hidden": 3.28125, "loss/jsd": 0.0, "loss/logits": 0.27119141817092896, "step": 2780 }, { "epoch": 0.173875, "grad_norm": 3.34375, "grad_norm_var": 0.04927978515625, "learning_rate": 0.0001, "loss": 8.0219, "loss/crossentropy": 2.1972113251686096, "loss/hidden": 3.28125, "loss/jsd": 0.0, "loss/logits": 0.26067984104156494, "step": 2782 }, { "epoch": 0.174, "grad_norm": 2.84375, "grad_norm_var": 0.05, "learning_rate": 0.0001, "loss": 8.1459, "loss/crossentropy": 2.0820754766464233, "loss/hidden": 3.4375, "loss/jsd": 0.0, "loss/logits": 0.2678705006837845, "step": 2784 }, { "epoch": 0.174125, "grad_norm": 2.984375, "grad_norm_var": 0.053759765625, "learning_rate": 0.0001, "loss": 7.9487, "loss/crossentropy": 2.3287781476974487, "loss/hidden": 3.2734375, "loss/jsd": 0.0, "loss/logits": 0.2594810128211975, "step": 2786 }, { "epoch": 0.17425, "grad_norm": 3.109375, "grad_norm_var": 0.06402079264322917, "learning_rate": 0.0001, "loss": 8.0273, "loss/crossentropy": 2.073160171508789, "loss/hidden": 3.265625, "loss/jsd": 0.0, "loss/logits": 0.2546188533306122, "step": 2788 }, { "epoch": 0.174375, "grad_norm": 2.734375, "grad_norm_var": 0.07014567057291667, "learning_rate": 0.0001, "loss": 8.0536, "loss/crossentropy": 2.318626046180725, "loss/hidden": 3.28125, "loss/jsd": 0.0, "loss/logits": 0.2673945128917694, "step": 2790 }, { "epoch": 0.1745, "grad_norm": 2.9375, "grad_norm_var": 0.07393290201822916, "learning_rate": 0.0001, "loss": 8.2079, "loss/crossentropy": 2.038848638534546, "loss/hidden": 3.265625, "loss/jsd": 0.0, "loss/logits": 0.252300888299942, "step": 2792 }, { "epoch": 0.174625, "grad_norm": 2.953125, "grad_norm_var": 0.051667277018229166, "learning_rate": 0.0001, "loss": 8.0414, "loss/crossentropy": 2.451392412185669, "loss/hidden": 3.2890625, "loss/jsd": 0.0, "loss/logits": 0.27607588469982147, "step": 2794 }, { "epoch": 0.17475, "grad_norm": 2.6875, "grad_norm_var": 0.059733072916666664, "learning_rate": 0.0001, "loss": 7.8686, "loss/crossentropy": 2.328820824623108, "loss/hidden": 3.2578125, "loss/jsd": 0.0, "loss/logits": 0.27283255755901337, "step": 2796 }, { "epoch": 0.174875, "grad_norm": 4.09375, "grad_norm_var": 0.13127848307291667, "learning_rate": 0.0001, "loss": 8.6859, "loss/crossentropy": 2.4796829223632812, "loss/hidden": 3.3671875, "loss/jsd": 0.0, "loss/logits": 0.2798849195241928, "step": 2798 }, { "epoch": 0.175, "grad_norm": 3.09375, "grad_norm_var": 0.14205322265625, "learning_rate": 0.0001, "loss": 8.2104, "loss/crossentropy": 2.399091958999634, "loss/hidden": 3.1796875, "loss/jsd": 0.0, "loss/logits": 0.2533615976572037, "step": 2800 }, { "epoch": 0.175125, "grad_norm": 3.28125, "grad_norm_var": 0.1294830322265625, "learning_rate": 0.0001, "loss": 8.3688, "loss/crossentropy": 2.484113931655884, "loss/hidden": 3.3984375, "loss/jsd": 0.0, "loss/logits": 0.2587704509496689, "step": 2802 }, { "epoch": 0.17525, "grad_norm": 3.25, "grad_norm_var": 0.12359619140625, "learning_rate": 0.0001, "loss": 8.1114, "loss/crossentropy": 2.153530240058899, "loss/hidden": 3.3046875, "loss/jsd": 0.0, "loss/logits": 0.2764217406511307, "step": 2804 }, { "epoch": 0.175375, "grad_norm": 5.03125, "grad_norm_var": 0.34517313639322916, "learning_rate": 0.0001, "loss": 8.5133, "loss/crossentropy": 2.510820746421814, "loss/hidden": 3.296875, "loss/jsd": 0.0, "loss/logits": 0.28840383887290955, "step": 2806 }, { "epoch": 0.1755, "grad_norm": 3.78125, "grad_norm_var": 0.34462890625, "learning_rate": 0.0001, "loss": 8.3656, "loss/crossentropy": 2.233067274093628, "loss/hidden": 3.3671875, "loss/jsd": 0.0, "loss/logits": 0.27428072690963745, "step": 2808 }, { "epoch": 0.175625, "grad_norm": 3.203125, "grad_norm_var": 0.33743387858072915, "learning_rate": 0.0001, "loss": 8.3259, "loss/crossentropy": 2.4839993715286255, "loss/hidden": 3.34375, "loss/jsd": 0.0, "loss/logits": 0.2870935648679733, "step": 2810 }, { "epoch": 0.17575, "grad_norm": 3.421875, "grad_norm_var": 0.29002176920572914, "learning_rate": 0.0001, "loss": 8.2811, "loss/crossentropy": 2.1913881301879883, "loss/hidden": 3.234375, "loss/jsd": 0.0, "loss/logits": 0.2708476558327675, "step": 2812 }, { "epoch": 0.175875, "grad_norm": 3.015625, "grad_norm_var": 0.26023661295572914, "learning_rate": 0.0001, "loss": 8.0784, "loss/crossentropy": 1.9806787967681885, "loss/hidden": 3.2265625, "loss/jsd": 0.0, "loss/logits": 0.2621997892856598, "step": 2814 }, { "epoch": 0.176, "grad_norm": 3.25, "grad_norm_var": 0.26220601399739585, "learning_rate": 0.0001, "loss": 8.0237, "loss/crossentropy": 2.1387823820114136, "loss/hidden": 3.2734375, "loss/jsd": 0.0, "loss/logits": 0.26336149126291275, "step": 2816 }, { "epoch": 0.176125, "grad_norm": 2.828125, "grad_norm_var": 0.26614176432291664, "learning_rate": 0.0001, "loss": 8.2939, "loss/crossentropy": 2.361487627029419, "loss/hidden": 3.25, "loss/jsd": 0.0, "loss/logits": 0.2560403645038605, "step": 2818 }, { "epoch": 0.17625, "grad_norm": 3.21875, "grad_norm_var": 0.2558502197265625, "learning_rate": 0.0001, "loss": 8.0806, "loss/crossentropy": 2.1189752221107483, "loss/hidden": 3.2421875, "loss/jsd": 0.0, "loss/logits": 0.25846994668245316, "step": 2820 }, { "epoch": 0.176375, "grad_norm": 2.890625, "grad_norm_var": 0.05779622395833333, "learning_rate": 0.0001, "loss": 7.8692, "loss/crossentropy": 1.9953218698501587, "loss/hidden": 3.3203125, "loss/jsd": 0.0, "loss/logits": 0.25956152379512787, "step": 2822 }, { "epoch": 0.1765, "grad_norm": 3.234375, "grad_norm_var": 0.03194071451822917, "learning_rate": 0.0001, "loss": 8.0475, "loss/crossentropy": 1.9369473457336426, "loss/hidden": 3.25, "loss/jsd": 0.0, "loss/logits": 0.24360457062721252, "step": 2824 }, { "epoch": 0.176625, "grad_norm": 3.546875, "grad_norm_var": 0.0423248291015625, "learning_rate": 0.0001, "loss": 8.3675, "loss/crossentropy": 2.396170139312744, "loss/hidden": 3.3046875, "loss/jsd": 0.0, "loss/logits": 0.26752087473869324, "step": 2826 }, { "epoch": 0.17675, "grad_norm": 3.203125, "grad_norm_var": 0.03780924479166667, "learning_rate": 0.0001, "loss": 8.3014, "loss/crossentropy": 2.2906793355941772, "loss/hidden": 3.515625, "loss/jsd": 0.0, "loss/logits": 0.36236336827278137, "step": 2828 }, { "epoch": 0.176875, "grad_norm": 2.90625, "grad_norm_var": 0.041162109375, "learning_rate": 0.0001, "loss": 8.1376, "loss/crossentropy": 2.218887209892273, "loss/hidden": 3.2890625, "loss/jsd": 0.0, "loss/logits": 0.24862568825483322, "step": 2830 }, { "epoch": 0.177, "grad_norm": 2.890625, "grad_norm_var": 0.04208577473958333, "learning_rate": 0.0001, "loss": 8.163, "loss/crossentropy": 2.0639127492904663, "loss/hidden": 3.265625, "loss/jsd": 0.0, "loss/logits": 0.27621056139469147, "step": 2832 }, { "epoch": 0.177125, "grad_norm": 3.125, "grad_norm_var": 0.25239969889322916, "learning_rate": 0.0001, "loss": 8.3628, "loss/crossentropy": 2.087058901786804, "loss/hidden": 3.3359375, "loss/jsd": 0.0, "loss/logits": 0.26111891865730286, "step": 2834 }, { "epoch": 0.17725, "grad_norm": 3.0625, "grad_norm_var": 0.26060791015625, "learning_rate": 0.0001, "loss": 8.267, "loss/crossentropy": 2.103839159011841, "loss/hidden": 3.4609375, "loss/jsd": 0.0, "loss/logits": 0.28338518738746643, "step": 2836 }, { "epoch": 0.177375, "grad_norm": 2.96875, "grad_norm_var": 0.2627838134765625, "learning_rate": 0.0001, "loss": 8.2432, "loss/crossentropy": 2.3400384187698364, "loss/hidden": 3.3046875, "loss/jsd": 0.0, "loss/logits": 0.2456740438938141, "step": 2838 }, { "epoch": 0.1775, "grad_norm": 3.109375, "grad_norm_var": 0.2857421875, "learning_rate": 0.0001, "loss": 8.007, "loss/crossentropy": 2.3000999689102173, "loss/hidden": 3.25, "loss/jsd": 0.0, "loss/logits": 0.2595764845609665, "step": 2840 }, { "epoch": 0.177625, "grad_norm": 3.4375, "grad_norm_var": 0.28277587890625, "learning_rate": 0.0001, "loss": 8.2928, "loss/crossentropy": 2.559830904006958, "loss/hidden": 3.3828125, "loss/jsd": 0.0, "loss/logits": 0.2871163934469223, "step": 2842 }, { "epoch": 0.17775, "grad_norm": 3.875, "grad_norm_var": 10.18516337076823, "learning_rate": 0.0001, "loss": 8.5555, "loss/crossentropy": 2.147689163684845, "loss/hidden": 3.484375, "loss/jsd": 0.0, "loss/logits": 0.3283799886703491, "step": 2844 }, { "epoch": 0.177875, "grad_norm": 3.5625, "grad_norm_var": 19.36314188639323, "learning_rate": 0.0001, "loss": 8.8889, "loss/crossentropy": 2.7562062740325928, "loss/hidden": 3.7578125, "loss/jsd": 0.0, "loss/logits": 0.3979947119951248, "step": 2846 }, { "epoch": 0.178, "grad_norm": 3.015625, "grad_norm_var": 19.32244364420573, "learning_rate": 0.0001, "loss": 8.1916, "loss/crossentropy": 1.9290424585342407, "loss/hidden": 3.3359375, "loss/jsd": 0.0, "loss/logits": 0.24876631796360016, "step": 2848 }, { "epoch": 0.178125, "grad_norm": 3.15625, "grad_norm_var": 19.54898681640625, "learning_rate": 0.0001, "loss": 8.2663, "loss/crossentropy": 2.2004363536834717, "loss/hidden": 3.2265625, "loss/jsd": 0.0, "loss/logits": 0.2261395901441574, "step": 2850 }, { "epoch": 0.17825, "grad_norm": 3.609375, "grad_norm_var": 19.517594401041666, "learning_rate": 0.0001, "loss": 8.1784, "loss/crossentropy": 2.380824565887451, "loss/hidden": 3.265625, "loss/jsd": 0.0, "loss/logits": 0.26775553077459335, "step": 2852 }, { "epoch": 0.178375, "grad_norm": 3.125, "grad_norm_var": 19.418876139322915, "learning_rate": 0.0001, "loss": 8.04, "loss/crossentropy": 2.365883946418762, "loss/hidden": 3.2421875, "loss/jsd": 0.0, "loss/logits": 0.23994451016187668, "step": 2854 }, { "epoch": 0.1785, "grad_norm": 3.203125, "grad_norm_var": 19.283527628580728, "learning_rate": 0.0001, "loss": 8.408, "loss/crossentropy": 2.2959755659103394, "loss/hidden": 3.3125, "loss/jsd": 0.0, "loss/logits": 0.23302578926086426, "step": 2856 }, { "epoch": 0.178625, "grad_norm": 2.890625, "grad_norm_var": 19.461302693684896, "learning_rate": 0.0001, "loss": 7.9811, "loss/crossentropy": 2.177852749824524, "loss/hidden": 3.28125, "loss/jsd": 0.0, "loss/logits": 0.2694682776927948, "step": 2858 }, { "epoch": 0.17875, "grad_norm": 3.375, "grad_norm_var": 10.949723307291666, "learning_rate": 0.0001, "loss": 8.1998, "loss/crossentropy": 2.385365605354309, "loss/hidden": 3.296875, "loss/jsd": 0.0, "loss/logits": 0.2615288197994232, "step": 2860 }, { "epoch": 0.178875, "grad_norm": 3.125, "grad_norm_var": 0.02935791015625, "learning_rate": 0.0001, "loss": 8.0819, "loss/crossentropy": 2.107097029685974, "loss/hidden": 3.25, "loss/jsd": 0.0, "loss/logits": 0.2600784972310066, "step": 2862 }, { "epoch": 0.179, "grad_norm": 3.21875, "grad_norm_var": 0.029027303059895832, "learning_rate": 0.0001, "loss": 8.3424, "loss/crossentropy": 2.4825806617736816, "loss/hidden": 3.4296875, "loss/jsd": 0.0, "loss/logits": 0.2492770403623581, "step": 2864 }, { "epoch": 0.179125, "grad_norm": 3.09375, "grad_norm_var": 0.0363922119140625, "learning_rate": 0.0001, "loss": 8.015, "loss/crossentropy": 2.376753568649292, "loss/hidden": 3.3125, "loss/jsd": 0.0, "loss/logits": 0.24921638518571854, "step": 2866 }, { "epoch": 0.17925, "grad_norm": 3.375, "grad_norm_var": 0.03062744140625, "learning_rate": 0.0001, "loss": 8.0536, "loss/crossentropy": 1.8834964036941528, "loss/hidden": 3.3515625, "loss/jsd": 0.0, "loss/logits": 0.24860648065805435, "step": 2868 }, { "epoch": 0.179375, "grad_norm": 5.25, "grad_norm_var": 0.33015848795572916, "learning_rate": 0.0001, "loss": 8.4645, "loss/crossentropy": 2.091266393661499, "loss/hidden": 3.265625, "loss/jsd": 0.0, "loss/logits": 0.2547171041369438, "step": 2870 }, { "epoch": 0.1795, "grad_norm": 3.828125, "grad_norm_var": 0.35742085774739585, "learning_rate": 0.0001, "loss": 8.2093, "loss/crossentropy": 2.4292666912078857, "loss/hidden": 3.21875, "loss/jsd": 0.0, "loss/logits": 0.2616816312074661, "step": 2872 }, { "epoch": 0.179625, "grad_norm": 2.875, "grad_norm_var": 0.3619618733723958, "learning_rate": 0.0001, "loss": 8.0421, "loss/crossentropy": 1.9631532430648804, "loss/hidden": 3.3671875, "loss/jsd": 0.0, "loss/logits": 0.2373322695493698, "step": 2874 }, { "epoch": 0.17975, "grad_norm": 3.015625, "grad_norm_var": 0.366357421875, "learning_rate": 0.0001, "loss": 8.1007, "loss/crossentropy": 2.33024525642395, "loss/hidden": 3.34375, "loss/jsd": 0.0, "loss/logits": 0.25947806239128113, "step": 2876 }, { "epoch": 0.179875, "grad_norm": 3.046875, "grad_norm_var": 0.36554361979166666, "learning_rate": 0.0001, "loss": 8.179, "loss/crossentropy": 2.23485791683197, "loss/hidden": 3.265625, "loss/jsd": 0.0, "loss/logits": 0.23906433582305908, "step": 2878 }, { "epoch": 0.18, "grad_norm": 3.40625, "grad_norm_var": 0.36677958170572916, "learning_rate": 0.0001, "loss": 8.1053, "loss/crossentropy": 1.9656097888946533, "loss/hidden": 3.21875, "loss/jsd": 0.0, "loss/logits": 0.21824423223733902, "step": 2880 }, { "epoch": 0.180125, "grad_norm": 3.890625, "grad_norm_var": 0.3694000244140625, "learning_rate": 0.0001, "loss": 8.1386, "loss/crossentropy": 2.2142436504364014, "loss/hidden": 3.3046875, "loss/jsd": 0.0, "loss/logits": 0.2432505339384079, "step": 2882 }, { "epoch": 0.18025, "grad_norm": 2.921875, "grad_norm_var": 0.36314697265625, "learning_rate": 0.0001, "loss": 7.9427, "loss/crossentropy": 2.049947440624237, "loss/hidden": 3.2890625, "loss/jsd": 0.0, "loss/logits": 0.25083374232053757, "step": 2884 }, { "epoch": 0.180375, "grad_norm": 2.90625, "grad_norm_var": 0.10310872395833333, "learning_rate": 0.0001, "loss": 8.0033, "loss/crossentropy": 1.8752148747444153, "loss/hidden": 3.375, "loss/jsd": 0.0, "loss/logits": 0.22209620475769043, "step": 2886 }, { "epoch": 0.1805, "grad_norm": 4.625, "grad_norm_var": 0.20532938639322917, "learning_rate": 0.0001, "loss": 8.154, "loss/crossentropy": 2.293843626976013, "loss/hidden": 3.3671875, "loss/jsd": 0.0, "loss/logits": 0.2605673372745514, "step": 2888 }, { "epoch": 0.180625, "grad_norm": 3.265625, "grad_norm_var": 0.19661458333333334, "learning_rate": 0.0001, "loss": 8.3813, "loss/crossentropy": 2.6502093076705933, "loss/hidden": 3.3671875, "loss/jsd": 0.0, "loss/logits": 0.2919842600822449, "step": 2890 }, { "epoch": 0.18075, "grad_norm": 3.03125, "grad_norm_var": 0.19348856608072917, "learning_rate": 0.0001, "loss": 8.17, "loss/crossentropy": 2.2080347537994385, "loss/hidden": 3.3125, "loss/jsd": 0.0, "loss/logits": 0.2823172062635422, "step": 2892 }, { "epoch": 0.180875, "grad_norm": 3.09375, "grad_norm_var": 0.19078369140625, "learning_rate": 0.0001, "loss": 8.0838, "loss/crossentropy": 2.1315869092941284, "loss/hidden": 3.1953125, "loss/jsd": 0.0, "loss/logits": 0.22926902770996094, "step": 2894 }, { "epoch": 0.181, "grad_norm": 3.0625, "grad_norm_var": 0.19482014973958334, "learning_rate": 0.0001, "loss": 8.2607, "loss/crossentropy": 2.1946693658828735, "loss/hidden": 3.359375, "loss/jsd": 0.0, "loss/logits": 0.2978864014148712, "step": 2896 }, { "epoch": 0.181125, "grad_norm": 3.15625, "grad_norm_var": 0.16481831868489583, "learning_rate": 0.0001, "loss": 8.1676, "loss/crossentropy": 2.3941036462783813, "loss/hidden": 3.34375, "loss/jsd": 0.0, "loss/logits": 0.2867356091737747, "step": 2898 }, { "epoch": 0.18125, "grad_norm": 2.765625, "grad_norm_var": 0.17711181640625, "learning_rate": 0.0001, "loss": 8.018, "loss/crossentropy": 2.066552758216858, "loss/hidden": 3.2109375, "loss/jsd": 0.0, "loss/logits": 0.23682457953691483, "step": 2900 }, { "epoch": 0.181375, "grad_norm": 3.078125, "grad_norm_var": 0.1772857666015625, "learning_rate": 0.0001, "loss": 8.4728, "loss/crossentropy": 2.2247138023376465, "loss/hidden": 3.28125, "loss/jsd": 0.0, "loss/logits": 0.2622465193271637, "step": 2902 }, { "epoch": 0.1815, "grad_norm": 2.921875, "grad_norm_var": 0.022606404622395833, "learning_rate": 0.0001, "loss": 7.9567, "loss/crossentropy": 2.272329330444336, "loss/hidden": 3.1875, "loss/jsd": 0.0, "loss/logits": 0.2393846958875656, "step": 2904 }, { "epoch": 0.181625, "grad_norm": 3.265625, "grad_norm_var": 0.0299468994140625, "learning_rate": 0.0001, "loss": 8.4064, "loss/crossentropy": 2.1805957555770874, "loss/hidden": 3.390625, "loss/jsd": 0.0, "loss/logits": 0.3085392415523529, "step": 2906 }, { "epoch": 0.18175, "grad_norm": 2.9375, "grad_norm_var": 0.02437744140625, "learning_rate": 0.0001, "loss": 8.1843, "loss/crossentropy": 2.1811360120773315, "loss/hidden": 3.2265625, "loss/jsd": 0.0, "loss/logits": 0.26569266617298126, "step": 2908 }, { "epoch": 0.181875, "grad_norm": 3.6875, "grad_norm_var": 0.05241597493489583, "learning_rate": 0.0001, "loss": 7.9974, "loss/crossentropy": 2.2962520122528076, "loss/hidden": 3.3515625, "loss/jsd": 0.0, "loss/logits": 0.2718447148799896, "step": 2910 }, { "epoch": 0.182, "grad_norm": 2.96875, "grad_norm_var": 0.0521148681640625, "learning_rate": 0.0001, "loss": 8.2149, "loss/crossentropy": 2.4253257513046265, "loss/hidden": 3.25, "loss/jsd": 0.0, "loss/logits": 0.2339843213558197, "step": 2912 }, { "epoch": 0.182125, "grad_norm": 2.984375, "grad_norm_var": 0.05153706868489583, "learning_rate": 0.0001, "loss": 7.9561, "loss/crossentropy": 2.0304603576660156, "loss/hidden": 3.328125, "loss/jsd": 0.0, "loss/logits": 0.25152938812971115, "step": 2914 }, { "epoch": 0.18225, "grad_norm": 3.046875, "grad_norm_var": 0.043843587239583336, "learning_rate": 0.0001, "loss": 8.2177, "loss/crossentropy": 2.098052501678467, "loss/hidden": 3.3125, "loss/jsd": 0.0, "loss/logits": 0.26289620250463486, "step": 2916 }, { "epoch": 0.182375, "grad_norm": 3.375, "grad_norm_var": 0.04744364420572917, "learning_rate": 0.0001, "loss": 8.1297, "loss/crossentropy": 2.294631004333496, "loss/hidden": 3.2265625, "loss/jsd": 0.0, "loss/logits": 0.24835306406021118, "step": 2918 }, { "epoch": 0.1825, "grad_norm": 2.75, "grad_norm_var": 0.0549957275390625, "learning_rate": 0.0001, "loss": 7.9393, "loss/crossentropy": 2.0778174996376038, "loss/hidden": 3.2578125, "loss/jsd": 0.0, "loss/logits": 0.2518730163574219, "step": 2920 }, { "epoch": 0.182625, "grad_norm": 3.0625, "grad_norm_var": 0.0467437744140625, "learning_rate": 0.0001, "loss": 8.125, "loss/crossentropy": 2.165535807609558, "loss/hidden": 3.375, "loss/jsd": 0.0, "loss/logits": 0.25498564541339874, "step": 2922 }, { "epoch": 0.18275, "grad_norm": 3.21875, "grad_norm_var": 0.18338114420572918, "learning_rate": 0.0001, "loss": 8.2319, "loss/crossentropy": 2.3784775733947754, "loss/hidden": 3.453125, "loss/jsd": 0.0, "loss/logits": 0.28745289146900177, "step": 2924 }, { "epoch": 0.182875, "grad_norm": 2.84375, "grad_norm_var": 0.16863505045572916, "learning_rate": 0.0001, "loss": 7.9546, "loss/crossentropy": 2.0629165172576904, "loss/hidden": 3.234375, "loss/jsd": 0.0, "loss/logits": 0.23759926110506058, "step": 2926 }, { "epoch": 0.183, "grad_norm": 4.46875, "grad_norm_var": 0.27671610514322914, "learning_rate": 0.0001, "loss": 8.2899, "loss/crossentropy": 2.4927855730056763, "loss/hidden": 3.296875, "loss/jsd": 0.0, "loss/logits": 0.2830722779035568, "step": 2928 }, { "epoch": 0.183125, "grad_norm": 3.0, "grad_norm_var": 0.2729400634765625, "learning_rate": 0.0001, "loss": 8.4356, "loss/crossentropy": 2.415983200073242, "loss/hidden": 3.2890625, "loss/jsd": 0.0, "loss/logits": 0.2693839967250824, "step": 2930 }, { "epoch": 0.18325, "grad_norm": 2.734375, "grad_norm_var": 0.28308003743489585, "learning_rate": 0.0001, "loss": 8.0816, "loss/crossentropy": 2.032285451889038, "loss/hidden": 3.34375, "loss/jsd": 0.0, "loss/logits": 0.26067546010017395, "step": 2932 }, { "epoch": 0.183375, "grad_norm": 2.78125, "grad_norm_var": 0.29550374348958336, "learning_rate": 0.0001, "loss": 7.8154, "loss/crossentropy": 2.1377962827682495, "loss/hidden": 3.2421875, "loss/jsd": 0.0, "loss/logits": 0.24648211151361465, "step": 2934 }, { "epoch": 0.1835, "grad_norm": 2.953125, "grad_norm_var": 0.28983968098958335, "learning_rate": 0.0001, "loss": 8.0299, "loss/crossentropy": 2.4560409784317017, "loss/hidden": 3.3671875, "loss/jsd": 0.0, "loss/logits": 0.2714577168226242, "step": 2936 }, { "epoch": 0.183625, "grad_norm": 3.484375, "grad_norm_var": 0.29524332682291665, "learning_rate": 0.0001, "loss": 8.1931, "loss/crossentropy": 2.1783920526504517, "loss/hidden": 3.375, "loss/jsd": 0.0, "loss/logits": 0.2481744885444641, "step": 2938 }, { "epoch": 0.18375, "grad_norm": 3.046875, "grad_norm_var": 0.17757059733072916, "learning_rate": 0.0001, "loss": 8.0857, "loss/crossentropy": 1.9973394274711609, "loss/hidden": 3.3046875, "loss/jsd": 0.0, "loss/logits": 0.26759687066078186, "step": 2940 }, { "epoch": 0.183875, "grad_norm": 3.390625, "grad_norm_var": 0.17525634765625, "learning_rate": 0.0001, "loss": 8.0574, "loss/crossentropy": 2.3491625785827637, "loss/hidden": 3.359375, "loss/jsd": 0.0, "loss/logits": 0.2810981869697571, "step": 2942 }, { "epoch": 0.184, "grad_norm": 3.125, "grad_norm_var": 0.05301106770833333, "learning_rate": 0.0001, "loss": 7.9718, "loss/crossentropy": 2.3227654695510864, "loss/hidden": 3.25, "loss/jsd": 0.0, "loss/logits": 0.23687439411878586, "step": 2944 }, { "epoch": 0.184125, "grad_norm": 3.234375, "grad_norm_var": 0.052643839518229166, "learning_rate": 0.0001, "loss": 7.8879, "loss/crossentropy": 2.2228500843048096, "loss/hidden": 3.234375, "loss/jsd": 0.0, "loss/logits": 0.23989352583885193, "step": 2946 }, { "epoch": 0.18425, "grad_norm": 6.0625, "grad_norm_var": 0.6635050455729167, "learning_rate": 0.0001, "loss": 8.2911, "loss/crossentropy": 2.0555625557899475, "loss/hidden": 3.3359375, "loss/jsd": 0.0, "loss/logits": 0.2582997530698776, "step": 2948 }, { "epoch": 0.184375, "grad_norm": 3.03125, "grad_norm_var": 0.652490234375, "learning_rate": 0.0001, "loss": 8.3843, "loss/crossentropy": 2.608245849609375, "loss/hidden": 3.3515625, "loss/jsd": 0.0, "loss/logits": 0.2860964983701706, "step": 2950 }, { "epoch": 0.1845, "grad_norm": 3.484375, "grad_norm_var": 0.6067667643229167, "learning_rate": 0.0001, "loss": 8.2006, "loss/crossentropy": 2.300239324569702, "loss/hidden": 3.4921875, "loss/jsd": 0.0, "loss/logits": 0.25120706856250763, "step": 2952 }, { "epoch": 0.184625, "grad_norm": 3.390625, "grad_norm_var": 0.6073476155598958, "learning_rate": 0.0001, "loss": 8.3779, "loss/crossentropy": 2.411876678466797, "loss/hidden": 3.2890625, "loss/jsd": 0.0, "loss/logits": 0.2575942724943161, "step": 2954 }, { "epoch": 0.18475, "grad_norm": 3.203125, "grad_norm_var": 0.5947011311848959, "learning_rate": 0.0001, "loss": 8.2399, "loss/crossentropy": 2.376798152923584, "loss/hidden": 3.3125, "loss/jsd": 0.0, "loss/logits": 0.2712739408016205, "step": 2956 }, { "epoch": 0.184875, "grad_norm": 3.109375, "grad_norm_var": 0.6066365559895833, "learning_rate": 0.0001, "loss": 8.0585, "loss/crossentropy": 1.8379729390144348, "loss/hidden": 3.375, "loss/jsd": 0.0, "loss/logits": 0.2462129294872284, "step": 2958 }, { "epoch": 0.185, "grad_norm": 2.78125, "grad_norm_var": 0.6259836832682292, "learning_rate": 0.0001, "loss": 8.1868, "loss/crossentropy": 2.0481252670288086, "loss/hidden": 3.265625, "loss/jsd": 0.0, "loss/logits": 0.26679444313049316, "step": 2960 }, { "epoch": 0.185125, "grad_norm": 3.0625, "grad_norm_var": 0.6304270426432291, "learning_rate": 0.0001, "loss": 7.9287, "loss/crossentropy": 2.2976279258728027, "loss/hidden": 3.25, "loss/jsd": 0.0, "loss/logits": 0.24677569419145584, "step": 2962 }, { "epoch": 0.18525, "grad_norm": 2.90625, "grad_norm_var": 0.07884012858072917, "learning_rate": 0.0001, "loss": 8.2977, "loss/crossentropy": 2.3821330070495605, "loss/hidden": 3.3984375, "loss/jsd": 0.0, "loss/logits": 0.2738860100507736, "step": 2964 }, { "epoch": 0.185375, "grad_norm": 3.25, "grad_norm_var": 0.0497711181640625, "learning_rate": 0.0001, "loss": 8.2062, "loss/crossentropy": 2.2890477180480957, "loss/hidden": 3.34375, "loss/jsd": 0.0, "loss/logits": 0.2848459780216217, "step": 2966 }, { "epoch": 0.1855, "grad_norm": 3.203125, "grad_norm_var": 0.06569722493489584, "learning_rate": 0.0001, "loss": 8.1331, "loss/crossentropy": 2.350267171859741, "loss/hidden": 3.375, "loss/jsd": 0.0, "loss/logits": 0.29001131653785706, "step": 2968 }, { "epoch": 0.185625, "grad_norm": 2.953125, "grad_norm_var": 0.056376139322916664, "learning_rate": 0.0001, "loss": 8.2302, "loss/crossentropy": 2.1221266984939575, "loss/hidden": 3.2265625, "loss/jsd": 0.0, "loss/logits": 0.24626632034778595, "step": 2970 }, { "epoch": 0.18575, "grad_norm": 2.828125, "grad_norm_var": 0.060700480143229166, "learning_rate": 0.0001, "loss": 8.0472, "loss/crossentropy": 2.237822413444519, "loss/hidden": 3.2890625, "loss/jsd": 0.0, "loss/logits": 0.27403751015663147, "step": 2972 }, { "epoch": 0.185875, "grad_norm": 2.8125, "grad_norm_var": 0.06636962890625, "learning_rate": 0.0001, "loss": 8.0549, "loss/crossentropy": 2.0955153703689575, "loss/hidden": 3.34375, "loss/jsd": 0.0, "loss/logits": 0.26152363419532776, "step": 2974 }, { "epoch": 0.186, "grad_norm": 3.265625, "grad_norm_var": 0.06275634765625, "learning_rate": 0.0001, "loss": 8.0191, "loss/crossentropy": 2.1114585399627686, "loss/hidden": 3.4296875, "loss/jsd": 0.0, "loss/logits": 0.26552706956863403, "step": 2976 }, { "epoch": 0.186125, "grad_norm": 2.96875, "grad_norm_var": 0.10375874837239583, "learning_rate": 0.0001, "loss": 8.1071, "loss/crossentropy": 2.2367554903030396, "loss/hidden": 3.2578125, "loss/jsd": 0.0, "loss/logits": 0.26158712804317474, "step": 2978 }, { "epoch": 0.18625, "grad_norm": 3.765625, "grad_norm_var": 0.13137919108072918, "learning_rate": 0.0001, "loss": 7.9746, "loss/crossentropy": 2.159699559211731, "loss/hidden": 3.2421875, "loss/jsd": 0.0, "loss/logits": 0.24696200340986252, "step": 2980 }, { "epoch": 0.186375, "grad_norm": 3.15625, "grad_norm_var": 0.12905985514322918, "learning_rate": 0.0001, "loss": 8.4136, "loss/crossentropy": 2.300874710083008, "loss/hidden": 3.40625, "loss/jsd": 0.0, "loss/logits": 0.27589815855026245, "step": 2982 }, { "epoch": 0.1865, "grad_norm": 3.25, "grad_norm_var": 0.09980367024739584, "learning_rate": 0.0001, "loss": 7.9829, "loss/crossentropy": 2.128389060497284, "loss/hidden": 3.5, "loss/jsd": 0.0, "loss/logits": 0.24984990060329437, "step": 2984 }, { "epoch": 0.186625, "grad_norm": 2.96875, "grad_norm_var": 0.10515034993489583, "learning_rate": 0.0001, "loss": 8.0653, "loss/crossentropy": 2.1576287746429443, "loss/hidden": 3.203125, "loss/jsd": 0.0, "loss/logits": 0.236042819917202, "step": 2986 }, { "epoch": 0.18675, "grad_norm": 2.78125, "grad_norm_var": 0.10535481770833334, "learning_rate": 0.0001, "loss": 8.0694, "loss/crossentropy": 2.099122941493988, "loss/hidden": 3.21875, "loss/jsd": 0.0, "loss/logits": 0.26695704460144043, "step": 2988 }, { "epoch": 0.186875, "grad_norm": 2.921875, "grad_norm_var": 0.10093994140625, "learning_rate": 0.0001, "loss": 8.2552, "loss/crossentropy": 2.1657750606536865, "loss/hidden": 3.3671875, "loss/jsd": 0.0, "loss/logits": 0.24663035571575165, "step": 2990 }, { "epoch": 0.187, "grad_norm": 3.109375, "grad_norm_var": 0.09954325358072917, "learning_rate": 0.0001, "loss": 8.2725, "loss/crossentropy": 2.5715911388397217, "loss/hidden": 3.2421875, "loss/jsd": 0.0, "loss/logits": 0.2872152030467987, "step": 2992 }, { "epoch": 0.187125, "grad_norm": 3.078125, "grad_norm_var": 0.053023274739583334, "learning_rate": 0.0001, "loss": 8.1622, "loss/crossentropy": 2.0664035081863403, "loss/hidden": 3.3125, "loss/jsd": 0.0, "loss/logits": 0.2384256348013878, "step": 2994 }, { "epoch": 0.18725, "grad_norm": 2.796875, "grad_norm_var": 0.018843587239583334, "learning_rate": 0.0001, "loss": 7.7933, "loss/crossentropy": 2.0348451733589172, "loss/hidden": 3.21875, "loss/jsd": 0.0, "loss/logits": 0.2337305098772049, "step": 2996 }, { "epoch": 0.187375, "grad_norm": 3.25, "grad_norm_var": 0.022359212239583332, "learning_rate": 0.0001, "loss": 8.2447, "loss/crossentropy": 2.201599597930908, "loss/hidden": 3.3046875, "loss/jsd": 0.0, "loss/logits": 0.25810810923576355, "step": 2998 }, { "epoch": 0.1875, "grad_norm": 2.96875, "grad_norm_var": 0.0189849853515625, "learning_rate": 0.0001, "loss": 8.144, "loss/crossentropy": 1.942829668521881, "loss/hidden": 3.3046875, "loss/jsd": 0.0, "loss/logits": 0.22982265055179596, "step": 3000 }, { "epoch": 0.187625, "grad_norm": 4.21875, "grad_norm_var": 0.11073811848958333, "learning_rate": 0.0001, "loss": 8.0226, "loss/crossentropy": 2.0326908826828003, "loss/hidden": 3.296875, "loss/jsd": 0.0, "loss/logits": 0.2725354731082916, "step": 3002 }, { "epoch": 0.18775, "grad_norm": 3.640625, "grad_norm_var": 0.12427978515625, "learning_rate": 0.0001, "loss": 8.2052, "loss/crossentropy": 2.0383809208869934, "loss/hidden": 3.4140625, "loss/jsd": 0.0, "loss/logits": 0.3001396656036377, "step": 3004 }, { "epoch": 0.187875, "grad_norm": 3.171875, "grad_norm_var": 0.12063700358072917, "learning_rate": 0.0001, "loss": 8.2463, "loss/crossentropy": 2.172344207763672, "loss/hidden": 3.3515625, "loss/jsd": 0.0, "loss/logits": 0.27515046298503876, "step": 3006 }, { "epoch": 0.188, "grad_norm": 3.296875, "grad_norm_var": 0.1205718994140625, "learning_rate": 0.0001, "loss": 8.3859, "loss/crossentropy": 2.4539778232574463, "loss/hidden": 3.3125, "loss/jsd": 0.0, "loss/logits": 0.26322928071022034, "step": 3008 }, { "epoch": 0.188125, "grad_norm": 3.03125, "grad_norm_var": 0.12419331868489583, "learning_rate": 0.0001, "loss": 8.1055, "loss/crossentropy": 2.295140862464905, "loss/hidden": 3.2734375, "loss/jsd": 0.0, "loss/logits": 0.26453797519207, "step": 3010 }, { "epoch": 0.18825, "grad_norm": 2.953125, "grad_norm_var": 0.114697265625, "learning_rate": 0.0001, "loss": 7.9506, "loss/crossentropy": 2.2732503414154053, "loss/hidden": 3.234375, "loss/jsd": 0.0, "loss/logits": 0.24187566339969635, "step": 3012 }, { "epoch": 0.188375, "grad_norm": 3.25, "grad_norm_var": 0.1217193603515625, "learning_rate": 0.0001, "loss": 7.7345, "loss/crossentropy": 2.3772653341293335, "loss/hidden": 3.21875, "loss/jsd": 0.0, "loss/logits": 0.2556615471839905, "step": 3014 }, { "epoch": 0.1885, "grad_norm": 3.03125, "grad_norm_var": 0.12017313639322917, "learning_rate": 0.0001, "loss": 8.0212, "loss/crossentropy": 2.0663068890571594, "loss/hidden": 3.1875, "loss/jsd": 0.0, "loss/logits": 0.24910389631986618, "step": 3016 }, { "epoch": 0.188625, "grad_norm": 3.46875, "grad_norm_var": 0.052164713541666664, "learning_rate": 0.0001, "loss": 8.1829, "loss/crossentropy": 2.0673555731773376, "loss/hidden": 3.1875, "loss/jsd": 0.0, "loss/logits": 0.2500082552433014, "step": 3018 }, { "epoch": 0.18875, "grad_norm": 2.890625, "grad_norm_var": 0.035498046875, "learning_rate": 0.0001, "loss": 8.0806, "loss/crossentropy": 2.283666253089905, "loss/hidden": 3.2421875, "loss/jsd": 0.0, "loss/logits": 0.25972698628902435, "step": 3020 }, { "epoch": 0.188875, "grad_norm": 2.9375, "grad_norm_var": 0.035725911458333336, "learning_rate": 0.0001, "loss": 7.7757, "loss/crossentropy": 2.148744285106659, "loss/hidden": 3.2109375, "loss/jsd": 0.0, "loss/logits": 0.2475430816411972, "step": 3022 }, { "epoch": 0.189, "grad_norm": 3.4375, "grad_norm_var": 0.04422200520833333, "learning_rate": 0.0001, "loss": 8.0354, "loss/crossentropy": 2.236390233039856, "loss/hidden": 3.4296875, "loss/jsd": 0.0, "loss/logits": 0.2862044423818588, "step": 3024 }, { "epoch": 0.189125, "grad_norm": 2.953125, "grad_norm_var": 0.04456380208333333, "learning_rate": 0.0001, "loss": 7.9881, "loss/crossentropy": 2.128193736076355, "loss/hidden": 3.21875, "loss/jsd": 0.0, "loss/logits": 0.24634715169668198, "step": 3026 }, { "epoch": 0.18925, "grad_norm": 2.890625, "grad_norm_var": 0.04996337890625, "learning_rate": 0.0001, "loss": 8.0476, "loss/crossentropy": 2.1916226148605347, "loss/hidden": 3.1875, "loss/jsd": 0.0, "loss/logits": 0.23787105828523636, "step": 3028 }, { "epoch": 0.189375, "grad_norm": 2.8125, "grad_norm_var": 0.045149739583333334, "learning_rate": 0.0001, "loss": 7.9717, "loss/crossentropy": 2.012889325618744, "loss/hidden": 3.2109375, "loss/jsd": 0.0, "loss/logits": 0.2314399853348732, "step": 3030 }, { "epoch": 0.1895, "grad_norm": 3.078125, "grad_norm_var": 0.09358723958333333, "learning_rate": 0.0001, "loss": 7.9838, "loss/crossentropy": 1.9517049193382263, "loss/hidden": 3.34375, "loss/jsd": 0.0, "loss/logits": 0.23443001508712769, "step": 3032 }, { "epoch": 0.189625, "grad_norm": 3.671875, "grad_norm_var": 0.1109527587890625, "learning_rate": 0.0001, "loss": 7.8672, "loss/crossentropy": 2.0745259523391724, "loss/hidden": 3.21875, "loss/jsd": 0.0, "loss/logits": 0.24954179674386978, "step": 3034 }, { "epoch": 0.18975, "grad_norm": 8.3125, "grad_norm_var": 1.8283843994140625, "learning_rate": 0.0001, "loss": 8.4367, "loss/crossentropy": 1.996957778930664, "loss/hidden": 3.296875, "loss/jsd": 0.0, "loss/logits": 0.2448180690407753, "step": 3036 }, { "epoch": 0.189875, "grad_norm": 2.953125, "grad_norm_var": 1.8130818684895833, "learning_rate": 0.0001, "loss": 8.1351, "loss/crossentropy": 1.9828272461891174, "loss/hidden": 3.3203125, "loss/jsd": 0.0, "loss/logits": 0.26171596348285675, "step": 3038 }, { "epoch": 0.19, "grad_norm": 3.171875, "grad_norm_var": 1.7945220947265625, "learning_rate": 0.0001, "loss": 8.4596, "loss/crossentropy": 2.505177140235901, "loss/hidden": 3.28125, "loss/jsd": 0.0, "loss/logits": 0.27588650584220886, "step": 3040 }, { "epoch": 0.190125, "grad_norm": 3.34375, "grad_norm_var": 1.7588043212890625, "learning_rate": 0.0001, "loss": 8.4254, "loss/crossentropy": 2.637117862701416, "loss/hidden": 3.4296875, "loss/jsd": 0.0, "loss/logits": 0.26090574264526367, "step": 3042 }, { "epoch": 0.19025, "grad_norm": 3.328125, "grad_norm_var": 1.7012044270833333, "learning_rate": 0.0001, "loss": 8.2038, "loss/crossentropy": 2.3125483989715576, "loss/hidden": 3.25, "loss/jsd": 0.0, "loss/logits": 0.2760762870311737, "step": 3044 }, { "epoch": 0.190375, "grad_norm": 3.328125, "grad_norm_var": 1.6323557535807292, "learning_rate": 0.0001, "loss": 8.4245, "loss/crossentropy": 2.215254306793213, "loss/hidden": 3.359375, "loss/jsd": 0.0, "loss/logits": 0.26608458161354065, "step": 3046 }, { "epoch": 0.1905, "grad_norm": 3.25, "grad_norm_var": 1.6180948893229166, "learning_rate": 0.0001, "loss": 8.1121, "loss/crossentropy": 2.02052503824234, "loss/hidden": 3.1796875, "loss/jsd": 0.0, "loss/logits": 0.26287131011486053, "step": 3048 }, { "epoch": 0.190625, "grad_norm": 3.015625, "grad_norm_var": 1.6158274332682292, "learning_rate": 0.0001, "loss": 8.0364, "loss/crossentropy": 2.1065566539764404, "loss/hidden": 3.296875, "loss/jsd": 0.0, "loss/logits": 0.2549958974123001, "step": 3050 }, { "epoch": 0.19075, "grad_norm": 3.015625, "grad_norm_var": 0.04055582682291667, "learning_rate": 0.0001, "loss": 8.1742, "loss/crossentropy": 2.0868913531303406, "loss/hidden": 3.21875, "loss/jsd": 0.0, "loss/logits": 0.2490181252360344, "step": 3052 }, { "epoch": 0.190875, "grad_norm": 3.15625, "grad_norm_var": 0.03693033854166667, "learning_rate": 0.0001, "loss": 8.137, "loss/crossentropy": 2.562083125114441, "loss/hidden": 3.34375, "loss/jsd": 0.0, "loss/logits": 0.2669790983200073, "step": 3054 }, { "epoch": 0.191, "grad_norm": 3.21875, "grad_norm_var": 0.035628255208333334, "learning_rate": 0.0001, "loss": 7.8852, "loss/crossentropy": 1.9702489972114563, "loss/hidden": 3.1875, "loss/jsd": 0.0, "loss/logits": 0.2521437928080559, "step": 3056 }, { "epoch": 0.191125, "grad_norm": 5.8125, "grad_norm_var": 0.4646392822265625, "learning_rate": 0.0001, "loss": 8.2322, "loss/crossentropy": 2.1872304677963257, "loss/hidden": 3.1875, "loss/jsd": 0.0, "loss/logits": 0.2451140657067299, "step": 3058 }, { "epoch": 0.19125, "grad_norm": 3.140625, "grad_norm_var": 0.46822509765625, "learning_rate": 0.0001, "loss": 8.3641, "loss/crossentropy": 2.0753592252731323, "loss/hidden": 3.21875, "loss/jsd": 0.0, "loss/logits": 0.23564471304416656, "step": 3060 }, { "epoch": 0.191375, "grad_norm": 3.09375, "grad_norm_var": 0.48017578125, "learning_rate": 0.0001, "loss": 7.9076, "loss/crossentropy": 1.8795402646064758, "loss/hidden": 3.5703125, "loss/jsd": 0.0, "loss/logits": 0.23920582979917526, "step": 3062 }, { "epoch": 0.1915, "grad_norm": 3.109375, "grad_norm_var": 0.48013916015625, "learning_rate": 0.0001, "loss": 8.2125, "loss/crossentropy": 1.9972794651985168, "loss/hidden": 3.4375, "loss/jsd": 0.0, "loss/logits": 0.29848983883857727, "step": 3064 }, { "epoch": 0.191625, "grad_norm": 3.015625, "grad_norm_var": 0.4837890625, "learning_rate": 0.0001, "loss": 8.0707, "loss/crossentropy": 2.3735026121139526, "loss/hidden": 3.1640625, "loss/jsd": 0.0, "loss/logits": 0.25427188724279404, "step": 3066 }, { "epoch": 0.19175, "grad_norm": 2.96875, "grad_norm_var": 0.4855377197265625, "learning_rate": 0.0001, "loss": 8.2161, "loss/crossentropy": 2.264596104621887, "loss/hidden": 3.375, "loss/jsd": 0.0, "loss/logits": 0.2805734723806381, "step": 3068 }, { "epoch": 0.191875, "grad_norm": 3.109375, "grad_norm_var": 0.484228515625, "learning_rate": 0.0001, "loss": 8.2559, "loss/crossentropy": 2.0113645792007446, "loss/hidden": 3.265625, "loss/jsd": 0.0, "loss/logits": 0.2517086789011955, "step": 3070 }, { "epoch": 0.192, "grad_norm": 2.9375, "grad_norm_var": 0.5784820556640625, "learning_rate": 0.0001, "loss": 8.2926, "loss/crossentropy": 2.363813877105713, "loss/hidden": 3.2890625, "loss/jsd": 0.0, "loss/logits": 0.28991248458623886, "step": 3072 }, { "epoch": 0.192125, "grad_norm": 3.171875, "grad_norm_var": 0.21458333333333332, "learning_rate": 0.0001, "loss": 8.411, "loss/crossentropy": 2.4118860960006714, "loss/hidden": 3.375, "loss/jsd": 0.0, "loss/logits": 0.2831159234046936, "step": 3074 }, { "epoch": 0.19225, "grad_norm": 2.96875, "grad_norm_var": 0.21534830729166668, "learning_rate": 0.0001, "loss": 8.2235, "loss/crossentropy": 2.082840859889984, "loss/hidden": 3.234375, "loss/jsd": 0.0, "loss/logits": 0.2412284016609192, "step": 3076 }, { "epoch": 0.192375, "grad_norm": 2.734375, "grad_norm_var": 0.23146158854166668, "learning_rate": 0.0001, "loss": 8.0682, "loss/crossentropy": 2.2923243045806885, "loss/hidden": 3.234375, "loss/jsd": 0.0, "loss/logits": 0.2539145052433014, "step": 3078 }, { "epoch": 0.1925, "grad_norm": 2.859375, "grad_norm_var": 0.24404296875, "learning_rate": 0.0001, "loss": 8.0316, "loss/crossentropy": 2.2611557245254517, "loss/hidden": 3.2109375, "loss/jsd": 0.0, "loss/logits": 0.24866493791341782, "step": 3080 }, { "epoch": 0.192625, "grad_norm": 3.0, "grad_norm_var": 0.24135640462239583, "learning_rate": 0.0001, "loss": 8.3842, "loss/crossentropy": 2.1011396646499634, "loss/hidden": 3.5625, "loss/jsd": 0.0, "loss/logits": 0.2922767102718353, "step": 3082 }, { "epoch": 0.19275, "grad_norm": 3.234375, "grad_norm_var": 0.23547770182291666, "learning_rate": 0.0001, "loss": 8.2303, "loss/crossentropy": 2.256028175354004, "loss/hidden": 3.2265625, "loss/jsd": 0.0, "loss/logits": 0.28726726770401, "step": 3084 }, { "epoch": 0.192875, "grad_norm": 2.953125, "grad_norm_var": 0.23726806640625, "learning_rate": 0.0001, "loss": 8.1028, "loss/crossentropy": 2.3124324083328247, "loss/hidden": 3.25, "loss/jsd": 0.0, "loss/logits": 0.28199779987335205, "step": 3086 }, { "epoch": 0.193, "grad_norm": 2.90625, "grad_norm_var": 0.12746480305989583, "learning_rate": 0.0001, "loss": 7.9293, "loss/crossentropy": 2.095321834087372, "loss/hidden": 3.1953125, "loss/jsd": 0.0, "loss/logits": 0.2518906518816948, "step": 3088 }, { "epoch": 0.193125, "grad_norm": 2.796875, "grad_norm_var": 0.0335113525390625, "learning_rate": 0.0001, "loss": 7.9163, "loss/crossentropy": 2.1881635189056396, "loss/hidden": 3.2265625, "loss/jsd": 0.0, "loss/logits": 0.2385866716504097, "step": 3090 }, { "epoch": 0.19325, "grad_norm": 3.09375, "grad_norm_var": 0.026325480143229166, "learning_rate": 0.0001, "loss": 7.9345, "loss/crossentropy": 2.1771040558815002, "loss/hidden": 3.2421875, "loss/jsd": 0.0, "loss/logits": 0.25531386584043503, "step": 3092 }, { "epoch": 0.193375, "grad_norm": 3.0, "grad_norm_var": 0.04688212076822917, "learning_rate": 0.0001, "loss": 8.1576, "loss/crossentropy": 2.213144898414612, "loss/hidden": 3.2734375, "loss/jsd": 0.0, "loss/logits": 0.27239470183849335, "step": 3094 }, { "epoch": 0.1935, "grad_norm": 2.96875, "grad_norm_var": 0.045166015625, "learning_rate": 0.0001, "loss": 8.0865, "loss/crossentropy": 2.0957645177841187, "loss/hidden": 3.34375, "loss/jsd": 0.0, "loss/logits": 0.2644564360380173, "step": 3096 }, { "epoch": 0.193625, "grad_norm": 3.109375, "grad_norm_var": 0.04537353515625, "learning_rate": 0.0001, "loss": 8.0834, "loss/crossentropy": 2.4782612323760986, "loss/hidden": 3.21875, "loss/jsd": 0.0, "loss/logits": 0.24987854063510895, "step": 3098 }, { "epoch": 0.19375, "grad_norm": 3.09375, "grad_norm_var": 0.04449462890625, "learning_rate": 0.0001, "loss": 8.1192, "loss/crossentropy": 2.154414653778076, "loss/hidden": 3.25, "loss/jsd": 0.0, "loss/logits": 0.24988670647144318, "step": 3100 }, { "epoch": 0.193875, "grad_norm": 2.875, "grad_norm_var": 0.0470611572265625, "learning_rate": 0.0001, "loss": 7.7171, "loss/crossentropy": 1.8572211861610413, "loss/hidden": 3.3359375, "loss/jsd": 0.0, "loss/logits": 0.24098588526248932, "step": 3102 }, { "epoch": 0.194, "grad_norm": 2.859375, "grad_norm_var": 0.0474273681640625, "learning_rate": 0.0001, "loss": 7.9725, "loss/crossentropy": 2.1595897674560547, "loss/hidden": 3.1796875, "loss/jsd": 0.0, "loss/logits": 0.23436659574508667, "step": 3104 }, { "epoch": 0.194125, "grad_norm": 2.921875, "grad_norm_var": 0.043538411458333336, "learning_rate": 0.0001, "loss": 8.2479, "loss/crossentropy": 2.3742247819900513, "loss/hidden": 3.25, "loss/jsd": 0.0, "loss/logits": 0.24041110277175903, "step": 3106 }, { "epoch": 0.19425, "grad_norm": 3.171875, "grad_norm_var": 0.04283447265625, "learning_rate": 0.0001, "loss": 8.4474, "loss/crossentropy": 2.2766979932785034, "loss/hidden": 3.3828125, "loss/jsd": 0.0, "loss/logits": 0.28190480172634125, "step": 3108 }, { "epoch": 0.194375, "grad_norm": 2.8125, "grad_norm_var": 0.01871337890625, "learning_rate": 0.0001, "loss": 7.935, "loss/crossentropy": 2.0490583181381226, "loss/hidden": 3.21875, "loss/jsd": 0.0, "loss/logits": 0.23022068291902542, "step": 3110 }, { "epoch": 0.1945, "grad_norm": 3.21875, "grad_norm_var": 0.022557576497395832, "learning_rate": 0.0001, "loss": 7.999, "loss/crossentropy": 2.0352837443351746, "loss/hidden": 3.3125, "loss/jsd": 0.0, "loss/logits": 0.27862748503685, "step": 3112 }, { "epoch": 0.194625, "grad_norm": 2.828125, "grad_norm_var": 0.022412109375, "learning_rate": 0.0001, "loss": 7.8593, "loss/crossentropy": 2.4907076358795166, "loss/hidden": 3.3203125, "loss/jsd": 0.0, "loss/logits": 0.24378710985183716, "step": 3114 }, { "epoch": 0.19475, "grad_norm": 3.671875, "grad_norm_var": 2.1835927327473956, "learning_rate": 0.0001, "loss": 8.505, "loss/crossentropy": 2.4156166315078735, "loss/hidden": 3.3359375, "loss/jsd": 0.0, "loss/logits": 0.2769237756729126, "step": 3116 }, { "epoch": 0.194875, "grad_norm": 3.015625, "grad_norm_var": 2.158885701497396, "learning_rate": 0.0001, "loss": 8.1386, "loss/crossentropy": 2.2743349075317383, "loss/hidden": 3.3125, "loss/jsd": 0.0, "loss/logits": 0.29171325266361237, "step": 3118 }, { "epoch": 0.195, "grad_norm": 3.3125, "grad_norm_var": 2.132373046875, "learning_rate": 0.0001, "loss": 8.2431, "loss/crossentropy": 2.1338940858840942, "loss/hidden": 3.2734375, "loss/jsd": 0.0, "loss/logits": 0.24566402286291122, "step": 3120 }, { "epoch": 0.195125, "grad_norm": 3.34375, "grad_norm_var": 2.09088134765625, "learning_rate": 0.0001, "loss": 8.1374, "loss/crossentropy": 2.0946673154830933, "loss/hidden": 3.25, "loss/jsd": 0.0, "loss/logits": 0.24040228873491287, "step": 3122 }, { "epoch": 0.19525, "grad_norm": 3.078125, "grad_norm_var": 2.1053782145182294, "learning_rate": 0.0001, "loss": 8.1047, "loss/crossentropy": 2.0649890899658203, "loss/hidden": 3.2578125, "loss/jsd": 0.0, "loss/logits": 0.27136728167533875, "step": 3124 }, { "epoch": 0.195375, "grad_norm": 3.15625, "grad_norm_var": 2.073583984375, "learning_rate": 0.0001, "loss": 8.1862, "loss/crossentropy": 2.20087730884552, "loss/hidden": 3.265625, "loss/jsd": 0.0, "loss/logits": 0.2605709135532379, "step": 3126 }, { "epoch": 0.1955, "grad_norm": 3.390625, "grad_norm_var": 5.774339803059896, "learning_rate": 0.0001, "loss": 8.8949, "loss/crossentropy": 2.463629126548767, "loss/hidden": 3.7421875, "loss/jsd": 0.0, "loss/logits": 0.48454609513282776, "step": 3128 }, { "epoch": 0.195625, "grad_norm": 2.828125, "grad_norm_var": 5.744204711914063, "learning_rate": 0.0001, "loss": 8.0592, "loss/crossentropy": 2.24616539478302, "loss/hidden": 3.2734375, "loss/jsd": 0.0, "loss/logits": 0.2649071663618088, "step": 3130 }, { "epoch": 0.19575, "grad_norm": 2.75, "grad_norm_var": 4.2225901285807295, "learning_rate": 0.0001, "loss": 7.9637, "loss/crossentropy": 2.031158745288849, "loss/hidden": 3.2578125, "loss/jsd": 0.0, "loss/logits": 0.23581355065107346, "step": 3132 }, { "epoch": 0.195875, "grad_norm": 3.09375, "grad_norm_var": 4.219026692708334, "learning_rate": 0.0001, "loss": 8.0351, "loss/crossentropy": 2.0583357214927673, "loss/hidden": 3.5078125, "loss/jsd": 0.0, "loss/logits": 0.23669998347759247, "step": 3134 }, { "epoch": 0.196, "grad_norm": 2.875, "grad_norm_var": 4.242203776041666, "learning_rate": 0.0001, "loss": 8.0279, "loss/crossentropy": 2.1241250038146973, "loss/hidden": 3.3125, "loss/jsd": 0.0, "loss/logits": 0.2675621211528778, "step": 3136 }, { "epoch": 0.196125, "grad_norm": 3.0, "grad_norm_var": 4.28258056640625, "learning_rate": 0.0001, "loss": 7.9089, "loss/crossentropy": 2.201973795890808, "loss/hidden": 3.171875, "loss/jsd": 0.0, "loss/logits": 0.24000436812639236, "step": 3138 }, { "epoch": 0.19625, "grad_norm": 3.125, "grad_norm_var": 4.28941650390625, "learning_rate": 0.0001, "loss": 7.8074, "loss/crossentropy": 2.3027167320251465, "loss/hidden": 3.1875, "loss/jsd": 0.0, "loss/logits": 0.24793165177106857, "step": 3140 }, { "epoch": 0.196375, "grad_norm": 2.671875, "grad_norm_var": 4.338101196289062, "learning_rate": 0.0001, "loss": 7.9764, "loss/crossentropy": 2.252092719078064, "loss/hidden": 3.2734375, "loss/jsd": 0.0, "loss/logits": 0.2759167104959488, "step": 3142 }, { "epoch": 0.1965, "grad_norm": 2.828125, "grad_norm_var": 0.020311482747395835, "learning_rate": 0.0001, "loss": 8.2503, "loss/crossentropy": 2.3070385456085205, "loss/hidden": 3.3046875, "loss/jsd": 0.0, "loss/logits": 0.2729652523994446, "step": 3144 }, { "epoch": 0.196625, "grad_norm": 3.53125, "grad_norm_var": 0.042952473958333334, "learning_rate": 0.0001, "loss": 8.0925, "loss/crossentropy": 2.074267029762268, "loss/hidden": 3.2890625, "loss/jsd": 0.0, "loss/logits": 0.29726506769657135, "step": 3146 }, { "epoch": 0.19675, "grad_norm": 2.921875, "grad_norm_var": 0.03704020182291667, "learning_rate": 0.0001, "loss": 7.9328, "loss/crossentropy": 1.9039493799209595, "loss/hidden": 3.3359375, "loss/jsd": 0.0, "loss/logits": 0.2604455202817917, "step": 3148 }, { "epoch": 0.196875, "grad_norm": 3.125, "grad_norm_var": 0.07862955729166667, "learning_rate": 0.0001, "loss": 8.0542, "loss/crossentropy": 2.2672078609466553, "loss/hidden": 3.3046875, "loss/jsd": 0.0, "loss/logits": 0.25014375895261765, "step": 3150 }, { "epoch": 0.197, "grad_norm": 5.40625, "grad_norm_var": 0.44168192545572915, "learning_rate": 0.0001, "loss": 8.177, "loss/crossentropy": 2.244156837463379, "loss/hidden": 3.21875, "loss/jsd": 0.0, "loss/logits": 0.2693129777908325, "step": 3152 }, { "epoch": 0.197125, "grad_norm": 2.96875, "grad_norm_var": 0.4394195556640625, "learning_rate": 0.0001, "loss": 8.011, "loss/crossentropy": 2.282211124897003, "loss/hidden": 3.234375, "loss/jsd": 0.0, "loss/logits": 0.246182382106781, "step": 3154 }, { "epoch": 0.19725, "grad_norm": 2.78125, "grad_norm_var": 0.44788004557291666, "learning_rate": 0.0001, "loss": 8.133, "loss/crossentropy": 2.4384297132492065, "loss/hidden": 3.375, "loss/jsd": 0.0, "loss/logits": 0.2920108437538147, "step": 3156 }, { "epoch": 0.197375, "grad_norm": 2.96875, "grad_norm_var": 0.4283111572265625, "learning_rate": 0.0001, "loss": 8.4777, "loss/crossentropy": 2.217103123664856, "loss/hidden": 3.2109375, "loss/jsd": 0.0, "loss/logits": 0.26621317863464355, "step": 3158 }, { "epoch": 0.1975, "grad_norm": 3.09375, "grad_norm_var": 0.42473042805989586, "learning_rate": 0.0001, "loss": 8.0046, "loss/crossentropy": 2.287508726119995, "loss/hidden": 3.2578125, "loss/jsd": 0.0, "loss/logits": 0.26456665992736816, "step": 3160 }, { "epoch": 0.197625, "grad_norm": 2.859375, "grad_norm_var": 0.41472066243489586, "learning_rate": 0.0001, "loss": 7.9413, "loss/crossentropy": 2.2249319553375244, "loss/hidden": 3.3125, "loss/jsd": 0.0, "loss/logits": 0.2526981383562088, "step": 3162 }, { "epoch": 0.19775, "grad_norm": 2.84375, "grad_norm_var": 0.4169230143229167, "learning_rate": 0.0001, "loss": 8.2529, "loss/crossentropy": 2.640749454498291, "loss/hidden": 3.3203125, "loss/jsd": 0.0, "loss/logits": 0.2600443512201309, "step": 3164 }, { "epoch": 0.197875, "grad_norm": 2.859375, "grad_norm_var": 0.4012115478515625, "learning_rate": 0.0001, "loss": 8.2345, "loss/crossentropy": 2.553953766822815, "loss/hidden": 3.203125, "loss/jsd": 0.0, "loss/logits": 0.24152962118387222, "step": 3166 }, { "epoch": 0.198, "grad_norm": 2.78125, "grad_norm_var": 0.034891764322916664, "learning_rate": 0.0001, "loss": 8.0211, "loss/crossentropy": 2.274572491645813, "loss/hidden": 3.2890625, "loss/jsd": 0.0, "loss/logits": 0.2605311721563339, "step": 3168 }, { "epoch": 0.198125, "grad_norm": 2.890625, "grad_norm_var": 0.5576405843098958, "learning_rate": 0.0001, "loss": 8.2881, "loss/crossentropy": 2.349919319152832, "loss/hidden": 3.3125, "loss/jsd": 0.0, "loss/logits": 0.2703807130455971, "step": 3170 }, { "epoch": 0.19825, "grad_norm": 3.03125, "grad_norm_var": 0.5448394775390625, "learning_rate": 0.0001, "loss": 8.1929, "loss/crossentropy": 2.343743681907654, "loss/hidden": 3.265625, "loss/jsd": 0.0, "loss/logits": 0.2511185109615326, "step": 3172 }, { "epoch": 0.198375, "grad_norm": 3.171875, "grad_norm_var": 0.5429433186848959, "learning_rate": 0.0001, "loss": 8.0174, "loss/crossentropy": 2.16545033454895, "loss/hidden": 3.1875, "loss/jsd": 0.0, "loss/logits": 0.23546815663576126, "step": 3174 }, { "epoch": 0.1985, "grad_norm": 2.90625, "grad_norm_var": 0.5407063802083333, "learning_rate": 0.0001, "loss": 7.9079, "loss/crossentropy": 2.1489791870117188, "loss/hidden": 3.203125, "loss/jsd": 0.0, "loss/logits": 0.24025410413742065, "step": 3176 }, { "epoch": 0.198625, "grad_norm": 3.125, "grad_norm_var": 0.5433746337890625, "learning_rate": 0.0001, "loss": 7.9213, "loss/crossentropy": 2.169691503047943, "loss/hidden": 3.25, "loss/jsd": 0.0, "loss/logits": 0.2868233025074005, "step": 3178 }, { "epoch": 0.19875, "grad_norm": 2.921875, "grad_norm_var": 0.5425608317057292, "learning_rate": 0.0001, "loss": 8.0195, "loss/crossentropy": 2.5110554695129395, "loss/hidden": 3.3203125, "loss/jsd": 0.0, "loss/logits": 0.2574908062815666, "step": 3180 }, { "epoch": 0.198875, "grad_norm": 2.6875, "grad_norm_var": 0.54677734375, "learning_rate": 0.0001, "loss": 8.1129, "loss/crossentropy": 2.3340593576431274, "loss/hidden": 3.375, "loss/jsd": 0.0, "loss/logits": 0.30013228207826614, "step": 3182 }, { "epoch": 0.199, "grad_norm": 3.09375, "grad_norm_var": 0.5442667643229167, "learning_rate": 0.0001, "loss": 7.9582, "loss/crossentropy": 2.0756776332855225, "loss/hidden": 3.4375, "loss/jsd": 0.0, "loss/logits": 0.2436860129237175, "step": 3184 }, { "epoch": 0.199125, "grad_norm": 5.5625, "grad_norm_var": 0.44795633951822916, "learning_rate": 0.0001, "loss": 8.2122, "loss/crossentropy": 2.113327741622925, "loss/hidden": 3.1875, "loss/jsd": 0.0, "loss/logits": 0.20469839870929718, "step": 3186 }, { "epoch": 0.19925, "grad_norm": 3.28125, "grad_norm_var": 0.4516998291015625, "learning_rate": 0.0001, "loss": 8.2074, "loss/crossentropy": 2.5936840772628784, "loss/hidden": 3.359375, "loss/jsd": 0.0, "loss/logits": 0.28213709592819214, "step": 3188 }, { "epoch": 0.199375, "grad_norm": 3.171875, "grad_norm_var": 0.4600494384765625, "learning_rate": 0.0001, "loss": 7.7539, "loss/crossentropy": 2.1763676404953003, "loss/hidden": 3.21875, "loss/jsd": 0.0, "loss/logits": 0.21971163898706436, "step": 3190 }, { "epoch": 0.1995, "grad_norm": 3.15625, "grad_norm_var": 0.4564615885416667, "learning_rate": 0.0001, "loss": 7.9865, "loss/crossentropy": 2.244124174118042, "loss/hidden": 3.3828125, "loss/jsd": 0.0, "loss/logits": 0.2699843645095825, "step": 3192 }, { "epoch": 0.199625, "grad_norm": 3.140625, "grad_norm_var": 0.4501617431640625, "learning_rate": 0.0001, "loss": 8.1955, "loss/crossentropy": 2.690917730331421, "loss/hidden": 3.296875, "loss/jsd": 0.0, "loss/logits": 0.2602824419736862, "step": 3194 }, { "epoch": 0.19975, "grad_norm": 2.8125, "grad_norm_var": 0.4597981770833333, "learning_rate": 0.0001, "loss": 7.954, "loss/crossentropy": 1.9736242294311523, "loss/hidden": 3.2578125, "loss/jsd": 0.0, "loss/logits": 0.24549099057912827, "step": 3196 }, { "epoch": 0.199875, "grad_norm": 2.765625, "grad_norm_var": 0.4725087483723958, "learning_rate": 0.0001, "loss": 7.9545, "loss/crossentropy": 2.103693187236786, "loss/hidden": 3.21875, "loss/jsd": 0.0, "loss/logits": 0.24165673553943634, "step": 3198 }, { "epoch": 0.2, "grad_norm": 3.140625, "grad_norm_var": 0.46343994140625, "learning_rate": 0.0001, "loss": 8.0753, "loss/crossentropy": 2.3715012073516846, "loss/hidden": 3.3046875, "loss/jsd": 0.0, "loss/logits": 0.2271784022450447, "step": 3200 }, { "epoch": 0.200125, "grad_norm": 2.546875, "grad_norm_var": 0.092333984375, "learning_rate": 0.0001, "loss": 7.7709, "loss/crossentropy": 1.905119776725769, "loss/hidden": 3.2109375, "loss/jsd": 0.0, "loss/logits": 0.2449379563331604, "step": 3202 }, { "epoch": 0.20025, "grad_norm": 2.8125, "grad_norm_var": 0.08449605305989584, "learning_rate": 0.0001, "loss": 7.7444, "loss/crossentropy": 2.426528573036194, "loss/hidden": 3.2890625, "loss/jsd": 0.0, "loss/logits": 0.278030663728714, "step": 3204 }, { "epoch": 0.200375, "grad_norm": 2.96875, "grad_norm_var": 0.0943267822265625, "learning_rate": 0.0001, "loss": 8.1623, "loss/crossentropy": 2.6639328002929688, "loss/hidden": 3.296875, "loss/jsd": 0.0, "loss/logits": 0.2694458067417145, "step": 3206 }, { "epoch": 0.2005, "grad_norm": 2.8125, "grad_norm_var": 0.09638264973958334, "learning_rate": 0.0001, "loss": 7.9942, "loss/crossentropy": 2.206454634666443, "loss/hidden": 3.265625, "loss/jsd": 0.0, "loss/logits": 0.246596060693264, "step": 3208 }, { "epoch": 0.200625, "grad_norm": 2.90625, "grad_norm_var": 0.09921468098958333, "learning_rate": 0.0001, "loss": 8.0576, "loss/crossentropy": 2.4695513248443604, "loss/hidden": 3.2734375, "loss/jsd": 0.0, "loss/logits": 0.2506033331155777, "step": 3210 }, { "epoch": 0.20075, "grad_norm": 3.203125, "grad_norm_var": 0.10899149576822917, "learning_rate": 0.0001, "loss": 8.1659, "loss/crossentropy": 2.414578676223755, "loss/hidden": 3.296875, "loss/jsd": 0.0, "loss/logits": 0.2819489985704422, "step": 3212 }, { "epoch": 0.200875, "grad_norm": 2.828125, "grad_norm_var": 0.07192281087239584, "learning_rate": 0.0001, "loss": 7.9123, "loss/crossentropy": 2.0450902581214905, "loss/hidden": 3.203125, "loss/jsd": 0.0, "loss/logits": 0.2466270476579666, "step": 3214 }, { "epoch": 0.201, "grad_norm": 3.0, "grad_norm_var": 0.06767578125, "learning_rate": 0.0001, "loss": 7.9173, "loss/crossentropy": 2.198870837688446, "loss/hidden": 3.3125, "loss/jsd": 0.0, "loss/logits": 0.25618264079093933, "step": 3216 }, { "epoch": 0.201125, "grad_norm": 3.5, "grad_norm_var": 0.06614176432291667, "learning_rate": 0.0001, "loss": 7.9371, "loss/crossentropy": 1.9925526976585388, "loss/hidden": 3.3125, "loss/jsd": 0.0, "loss/logits": 0.23090589046478271, "step": 3218 }, { "epoch": 0.20125, "grad_norm": 3.203125, "grad_norm_var": 0.060302734375, "learning_rate": 0.0001, "loss": 8.0432, "loss/crossentropy": 2.6509182453155518, "loss/hidden": 3.2734375, "loss/jsd": 0.0, "loss/logits": 0.23189008235931396, "step": 3220 }, { "epoch": 0.201375, "grad_norm": 3.0625, "grad_norm_var": 0.0681060791015625, "learning_rate": 0.0001, "loss": 7.736, "loss/crossentropy": 1.9328733682632446, "loss/hidden": 3.3046875, "loss/jsd": 0.0, "loss/logits": 0.2312452644109726, "step": 3222 }, { "epoch": 0.2015, "grad_norm": 2.75, "grad_norm_var": 0.06734619140625, "learning_rate": 0.0001, "loss": 8.1442, "loss/crossentropy": 2.3680273294448853, "loss/hidden": 3.2734375, "loss/jsd": 0.0, "loss/logits": 0.2512563616037369, "step": 3224 }, { "epoch": 0.201625, "grad_norm": 3.765625, "grad_norm_var": 0.3245920817057292, "learning_rate": 0.0001, "loss": 8.2189, "loss/crossentropy": 2.1417096853256226, "loss/hidden": 3.34375, "loss/jsd": 0.0, "loss/logits": 0.24675866961479187, "step": 3226 }, { "epoch": 0.20175, "grad_norm": 2.890625, "grad_norm_var": 0.3329386393229167, "learning_rate": 0.0001, "loss": 8.2001, "loss/crossentropy": 2.240624785423279, "loss/hidden": 3.2578125, "loss/jsd": 0.0, "loss/logits": 0.24948646873235703, "step": 3228 }, { "epoch": 0.201875, "grad_norm": 2.984375, "grad_norm_var": 0.3274892171223958, "learning_rate": 0.0001, "loss": 8.0371, "loss/crossentropy": 2.249924659729004, "loss/hidden": 3.2578125, "loss/jsd": 0.0, "loss/logits": 0.24714338779449463, "step": 3230 }, { "epoch": 0.202, "grad_norm": 2.828125, "grad_norm_var": 0.3283162434895833, "learning_rate": 0.0001, "loss": 8.0393, "loss/crossentropy": 2.1449814438819885, "loss/hidden": 3.2265625, "loss/jsd": 0.0, "loss/logits": 0.23471366614103317, "step": 3232 }, { "epoch": 0.202125, "grad_norm": 3.171875, "grad_norm_var": 0.32779032389322915, "learning_rate": 0.0001, "loss": 8.2906, "loss/crossentropy": 2.2335373163223267, "loss/hidden": 3.2734375, "loss/jsd": 0.0, "loss/logits": 0.2805033326148987, "step": 3234 }, { "epoch": 0.20225, "grad_norm": 2.765625, "grad_norm_var": 0.3413899739583333, "learning_rate": 0.0001, "loss": 7.9459, "loss/crossentropy": 2.3133461475372314, "loss/hidden": 3.2421875, "loss/jsd": 0.0, "loss/logits": 0.2567446604371071, "step": 3236 }, { "epoch": 0.202375, "grad_norm": 2.984375, "grad_norm_var": 0.3323893229166667, "learning_rate": 0.0001, "loss": 7.8241, "loss/crossentropy": 2.2659177780151367, "loss/hidden": 3.2109375, "loss/jsd": 0.0, "loss/logits": 0.23788409680128098, "step": 3238 }, { "epoch": 0.2025, "grad_norm": 3.046875, "grad_norm_var": 0.3299153645833333, "learning_rate": 0.0001, "loss": 7.8355, "loss/crossentropy": 1.9051202535629272, "loss/hidden": 3.21875, "loss/jsd": 0.0, "loss/logits": 0.2412119358778, "step": 3240 }, { "epoch": 0.202625, "grad_norm": 3.203125, "grad_norm_var": 0.032746378580729166, "learning_rate": 0.0001, "loss": 7.8876, "loss/crossentropy": 2.2133265137672424, "loss/hidden": 3.34375, "loss/jsd": 0.0, "loss/logits": 0.26755909621715546, "step": 3242 }, { "epoch": 0.20275, "grad_norm": 2.546875, "grad_norm_var": 0.036702473958333336, "learning_rate": 0.0001, "loss": 7.8017, "loss/crossentropy": 2.314054846763611, "loss/hidden": 3.2578125, "loss/jsd": 0.0, "loss/logits": 0.2537431865930557, "step": 3244 }, { "epoch": 0.202875, "grad_norm": 3.15625, "grad_norm_var": 0.058934529622395836, "learning_rate": 0.0001, "loss": 8.203, "loss/crossentropy": 2.364421248435974, "loss/hidden": 3.3046875, "loss/jsd": 0.0, "loss/logits": 0.2737329602241516, "step": 3246 }, { "epoch": 0.203, "grad_norm": 2.984375, "grad_norm_var": 0.0512847900390625, "learning_rate": 0.0001, "loss": 8.1269, "loss/crossentropy": 2.0864169001579285, "loss/hidden": 3.1953125, "loss/jsd": 0.0, "loss/logits": 0.23171114921569824, "step": 3248 }, { "epoch": 0.203125, "grad_norm": 2.84375, "grad_norm_var": 0.0517974853515625, "learning_rate": 0.0001, "loss": 7.9114, "loss/crossentropy": 2.1427239179611206, "loss/hidden": 3.234375, "loss/jsd": 0.0, "loss/logits": 0.23028402775526047, "step": 3250 }, { "epoch": 0.20325, "grad_norm": 2.75, "grad_norm_var": 0.06773681640625, "learning_rate": 0.0001, "loss": 8.1254, "loss/crossentropy": 2.391860604286194, "loss/hidden": 3.2421875, "loss/jsd": 0.0, "loss/logits": 0.28171634674072266, "step": 3252 }, { "epoch": 0.203375, "grad_norm": 3.015625, "grad_norm_var": 0.07106119791666667, "learning_rate": 0.0001, "loss": 8.0575, "loss/crossentropy": 1.9966859817504883, "loss/hidden": 3.21875, "loss/jsd": 0.0, "loss/logits": 0.25942039489746094, "step": 3254 }, { "epoch": 0.2035, "grad_norm": 3.09375, "grad_norm_var": 0.11903889973958333, "learning_rate": 0.0001, "loss": 8.2398, "loss/crossentropy": 2.604487895965576, "loss/hidden": 3.3984375, "loss/jsd": 0.0, "loss/logits": 0.33525606989860535, "step": 3256 }, { "epoch": 0.203625, "grad_norm": 2.921875, "grad_norm_var": 0.1185211181640625, "learning_rate": 0.0001, "loss": 8.2011, "loss/crossentropy": 2.1014347076416016, "loss/hidden": 3.296875, "loss/jsd": 0.0, "loss/logits": 0.24271199107170105, "step": 3258 }, { "epoch": 0.20375, "grad_norm": 3.125, "grad_norm_var": 0.0933258056640625, "learning_rate": 0.0001, "loss": 8.1381, "loss/crossentropy": 2.405876874923706, "loss/hidden": 3.28125, "loss/jsd": 0.0, "loss/logits": 0.2620885819196701, "step": 3260 }, { "epoch": 0.203875, "grad_norm": 3.25, "grad_norm_var": 0.225341796875, "learning_rate": 0.0001, "loss": 8.381, "loss/crossentropy": 2.3584976196289062, "loss/hidden": 3.3359375, "loss/jsd": 0.0, "loss/logits": 0.27236026525497437, "step": 3262 }, { "epoch": 0.204, "grad_norm": 2.859375, "grad_norm_var": 0.2334625244140625, "learning_rate": 0.0001, "loss": 8.0094, "loss/crossentropy": 2.3013174533843994, "loss/hidden": 3.1796875, "loss/jsd": 0.0, "loss/logits": 0.2458626553416252, "step": 3264 }, { "epoch": 0.204125, "grad_norm": 2.875, "grad_norm_var": 0.2292388916015625, "learning_rate": 0.0001, "loss": 7.9233, "loss/crossentropy": 1.8906287550926208, "loss/hidden": 3.21875, "loss/jsd": 0.0, "loss/logits": 0.2380588874220848, "step": 3266 }, { "epoch": 0.20425, "grad_norm": 3.0625, "grad_norm_var": 0.21341044108072918, "learning_rate": 0.0001, "loss": 7.9147, "loss/crossentropy": 2.2484216690063477, "loss/hidden": 3.2265625, "loss/jsd": 0.0, "loss/logits": 0.2623318284749985, "step": 3268 }, { "epoch": 0.204375, "grad_norm": 3.71875, "grad_norm_var": 0.23638916015625, "learning_rate": 0.0001, "loss": 8.2191, "loss/crossentropy": 2.299185633659363, "loss/hidden": 3.234375, "loss/jsd": 0.0, "loss/logits": 0.24296559393405914, "step": 3270 }, { "epoch": 0.2045, "grad_norm": 3.375, "grad_norm_var": 0.20302734375, "learning_rate": 0.0001, "loss": 8.0814, "loss/crossentropy": 2.173090398311615, "loss/hidden": 3.1953125, "loss/jsd": 0.0, "loss/logits": 0.21140458434820175, "step": 3272 }, { "epoch": 0.204625, "grad_norm": 2.84375, "grad_norm_var": 0.20867513020833334, "learning_rate": 0.0001, "loss": 7.8821, "loss/crossentropy": 2.3034706115722656, "loss/hidden": 3.2578125, "loss/jsd": 0.0, "loss/logits": 0.26016516983509064, "step": 3274 }, { "epoch": 0.20475, "grad_norm": 3.0625, "grad_norm_var": 0.21607157389322917, "learning_rate": 0.0001, "loss": 7.8581, "loss/crossentropy": 2.073949694633484, "loss/hidden": 3.1796875, "loss/jsd": 0.0, "loss/logits": 0.22817599028348923, "step": 3276 }, { "epoch": 0.204875, "grad_norm": 2.796875, "grad_norm_var": 0.06314188639322917, "learning_rate": 0.0001, "loss": 8.0344, "loss/crossentropy": 2.3152432441711426, "loss/hidden": 3.2421875, "loss/jsd": 0.0, "loss/logits": 0.25035272538661957, "step": 3278 }, { "epoch": 0.205, "grad_norm": 2.890625, "grad_norm_var": 0.06655171712239584, "learning_rate": 0.0001, "loss": 8.0313, "loss/crossentropy": 2.168930411338806, "loss/hidden": 3.1953125, "loss/jsd": 0.0, "loss/logits": 0.23998095095157623, "step": 3280 }, { "epoch": 0.205125, "grad_norm": 2.671875, "grad_norm_var": 0.07229410807291667, "learning_rate": 0.0001, "loss": 7.743, "loss/crossentropy": 2.0424435138702393, "loss/hidden": 3.15625, "loss/jsd": 0.0, "loss/logits": 0.21602368354797363, "step": 3282 }, { "epoch": 0.20525, "grad_norm": 2.90625, "grad_norm_var": 0.08011067708333333, "learning_rate": 0.0001, "loss": 7.957, "loss/crossentropy": 2.223048210144043, "loss/hidden": 3.2421875, "loss/jsd": 0.0, "loss/logits": 0.25354528427124023, "step": 3284 }, { "epoch": 0.205375, "grad_norm": 2.8125, "grad_norm_var": 0.04046223958333333, "learning_rate": 0.0001, "loss": 8.2095, "loss/crossentropy": 2.1094924211502075, "loss/hidden": 3.203125, "loss/jsd": 0.0, "loss/logits": 0.24489177763462067, "step": 3286 }, { "epoch": 0.2055, "grad_norm": 3.0, "grad_norm_var": 0.028490193684895835, "learning_rate": 0.0001, "loss": 8.2185, "loss/crossentropy": 2.0988100171089172, "loss/hidden": 3.2890625, "loss/jsd": 0.0, "loss/logits": 0.2616979777812958, "step": 3288 }, { "epoch": 0.205625, "grad_norm": 2.953125, "grad_norm_var": 0.029442342122395833, "learning_rate": 0.0001, "loss": 8.2005, "loss/crossentropy": 2.4190319776535034, "loss/hidden": 3.21875, "loss/jsd": 0.0, "loss/logits": 0.23693818598985672, "step": 3290 }, { "epoch": 0.20575, "grad_norm": 3.0625, "grad_norm_var": 0.031493123372395834, "learning_rate": 0.0001, "loss": 8.1089, "loss/crossentropy": 2.1579580903053284, "loss/hidden": 3.2578125, "loss/jsd": 0.0, "loss/logits": 0.26754793524742126, "step": 3292 }, { "epoch": 0.205875, "grad_norm": 2.890625, "grad_norm_var": 0.032380167643229166, "learning_rate": 0.0001, "loss": 8.1828, "loss/crossentropy": 2.5043996572494507, "loss/hidden": 3.296875, "loss/jsd": 0.0, "loss/logits": 0.2770374268293381, "step": 3294 }, { "epoch": 0.206, "grad_norm": 3.234375, "grad_norm_var": 0.033251953125, "learning_rate": 0.0001, "loss": 8.0759, "loss/crossentropy": 2.2610336542129517, "loss/hidden": 3.359375, "loss/jsd": 0.0, "loss/logits": 0.25794391334056854, "step": 3296 }, { "epoch": 0.206125, "grad_norm": 3.46875, "grad_norm_var": 0.0505523681640625, "learning_rate": 0.0001, "loss": 8.4392, "loss/crossentropy": 2.3251298666000366, "loss/hidden": 3.296875, "loss/jsd": 0.0, "loss/logits": 0.25523504614830017, "step": 3298 }, { "epoch": 0.20625, "grad_norm": 3.203125, "grad_norm_var": 0.03577067057291667, "learning_rate": 0.0001, "loss": 8.3786, "loss/crossentropy": 2.4848973751068115, "loss/hidden": 3.2734375, "loss/jsd": 0.0, "loss/logits": 0.28936903178691864, "step": 3300 }, { "epoch": 0.206375, "grad_norm": 2.90625, "grad_norm_var": 0.044367472330729164, "learning_rate": 0.0001, "loss": 7.8645, "loss/crossentropy": 2.0681798458099365, "loss/hidden": 3.1875, "loss/jsd": 0.0, "loss/logits": 0.22367826104164124, "step": 3302 }, { "epoch": 0.2065, "grad_norm": 3.0, "grad_norm_var": 0.04277242024739583, "learning_rate": 0.0001, "loss": 7.9294, "loss/crossentropy": 2.2913665175437927, "loss/hidden": 3.2265625, "loss/jsd": 0.0, "loss/logits": 0.2563953995704651, "step": 3304 }, { "epoch": 0.206625, "grad_norm": 3.359375, "grad_norm_var": 0.049117024739583334, "learning_rate": 0.0001, "loss": 8.0841, "loss/crossentropy": 2.156054139137268, "loss/hidden": 3.25, "loss/jsd": 0.0, "loss/logits": 0.29133178293704987, "step": 3306 }, { "epoch": 0.20675, "grad_norm": 2.828125, "grad_norm_var": 0.05422261555989583, "learning_rate": 0.0001, "loss": 7.9609, "loss/crossentropy": 2.0336960554122925, "loss/hidden": 3.203125, "loss/jsd": 0.0, "loss/logits": 0.24182691425085068, "step": 3308 }, { "epoch": 0.206875, "grad_norm": 3.296875, "grad_norm_var": 0.05927327473958333, "learning_rate": 0.0001, "loss": 8.1031, "loss/crossentropy": 2.2174229621887207, "loss/hidden": 3.25, "loss/jsd": 0.0, "loss/logits": 0.25793761759996414, "step": 3310 }, { "epoch": 0.207, "grad_norm": 2.859375, "grad_norm_var": 0.0614410400390625, "learning_rate": 0.0001, "loss": 7.5995, "loss/crossentropy": 2.125456750392914, "loss/hidden": 3.453125, "loss/jsd": 0.0, "loss/logits": 0.2864660769701004, "step": 3312 }, { "epoch": 0.207125, "grad_norm": 3.34375, "grad_norm_var": 0.0439453125, "learning_rate": 0.0001, "loss": 7.9563, "loss/crossentropy": 1.951561987400055, "loss/hidden": 3.3125, "loss/jsd": 0.0, "loss/logits": 0.2432233989238739, "step": 3314 }, { "epoch": 0.20725, "grad_norm": 2.875, "grad_norm_var": 0.0423828125, "learning_rate": 0.0001, "loss": 8.1328, "loss/crossentropy": 2.2466469407081604, "loss/hidden": 3.3125, "loss/jsd": 0.0, "loss/logits": 0.24979883432388306, "step": 3316 }, { "epoch": 0.207375, "grad_norm": 3.046875, "grad_norm_var": 0.033426920572916664, "learning_rate": 0.0001, "loss": 7.9811, "loss/crossentropy": 2.2825279235839844, "loss/hidden": 3.2265625, "loss/jsd": 0.0, "loss/logits": 0.25121283531188965, "step": 3318 }, { "epoch": 0.2075, "grad_norm": 3.03125, "grad_norm_var": 0.03323567708333333, "learning_rate": 0.0001, "loss": 7.7574, "loss/crossentropy": 2.0415857434272766, "loss/hidden": 3.4375, "loss/jsd": 0.0, "loss/logits": 0.23756401240825653, "step": 3320 }, { "epoch": 0.207625, "grad_norm": 3.140625, "grad_norm_var": 0.02847900390625, "learning_rate": 0.0001, "loss": 7.9265, "loss/crossentropy": 2.3567007780075073, "loss/hidden": 3.2265625, "loss/jsd": 0.0, "loss/logits": 0.2739853113889694, "step": 3322 }, { "epoch": 0.20775, "grad_norm": 3.1875, "grad_norm_var": 0.027799479166666665, "learning_rate": 0.0001, "loss": 8.0775, "loss/crossentropy": 2.3674511909484863, "loss/hidden": 3.2578125, "loss/jsd": 0.0, "loss/logits": 0.25960472971200943, "step": 3324 }, { "epoch": 0.207875, "grad_norm": 2.953125, "grad_norm_var": 0.021122233072916666, "learning_rate": 0.0001, "loss": 8.2336, "loss/crossentropy": 2.417602777481079, "loss/hidden": 3.2421875, "loss/jsd": 0.0, "loss/logits": 0.2494887337088585, "step": 3326 }, { "epoch": 0.208, "grad_norm": 2.78125, "grad_norm_var": 0.0252838134765625, "learning_rate": 0.0001, "loss": 7.8444, "loss/crossentropy": 2.1521695852279663, "loss/hidden": 3.21875, "loss/jsd": 0.0, "loss/logits": 0.2497367411851883, "step": 3328 }, { "epoch": 0.208125, "grad_norm": 2.734375, "grad_norm_var": 0.02945556640625, "learning_rate": 0.0001, "loss": 7.5654, "loss/crossentropy": 2.0655051469802856, "loss/hidden": 3.171875, "loss/jsd": 0.0, "loss/logits": 0.2116033211350441, "step": 3330 }, { "epoch": 0.20825, "grad_norm": 2.671875, "grad_norm_var": 0.0319000244140625, "learning_rate": 0.0001, "loss": 7.9378, "loss/crossentropy": 2.1603198051452637, "loss/hidden": 3.21875, "loss/jsd": 0.0, "loss/logits": 0.2504379004240036, "step": 3332 }, { "epoch": 0.208375, "grad_norm": 3.25, "grad_norm_var": 0.04130757649739583, "learning_rate": 0.0001, "loss": 8.1669, "loss/crossentropy": 2.4277652502059937, "loss/hidden": 3.171875, "loss/jsd": 0.0, "loss/logits": 0.25090693682432175, "step": 3334 }, { "epoch": 0.2085, "grad_norm": 2.71875, "grad_norm_var": 0.04576822916666667, "learning_rate": 0.0001, "loss": 7.9141, "loss/crossentropy": 1.8467026352882385, "loss/hidden": 3.2421875, "loss/jsd": 0.0, "loss/logits": 0.2204154208302498, "step": 3336 }, { "epoch": 0.208625, "grad_norm": 2.453125, "grad_norm_var": 0.047883097330729166, "learning_rate": 0.0001, "loss": 7.635, "loss/crossentropy": 1.894747018814087, "loss/hidden": 3.203125, "loss/jsd": 0.0, "loss/logits": 0.2181135192513466, "step": 3338 }, { "epoch": 0.20875, "grad_norm": 3.09375, "grad_norm_var": 0.04464518229166667, "learning_rate": 0.0001, "loss": 8.0699, "loss/crossentropy": 2.291446328163147, "loss/hidden": 3.1796875, "loss/jsd": 0.0, "loss/logits": 0.2898656576871872, "step": 3340 }, { "epoch": 0.208875, "grad_norm": 2.78125, "grad_norm_var": 0.0597076416015625, "learning_rate": 0.0001, "loss": 8.1058, "loss/crossentropy": 2.1971142292022705, "loss/hidden": 3.1640625, "loss/jsd": 0.0, "loss/logits": 0.21616211533546448, "step": 3342 }, { "epoch": 0.209, "grad_norm": 4.90625, "grad_norm_var": 0.32665913899739585, "learning_rate": 0.0001, "loss": 7.8898, "loss/crossentropy": 1.979128658771515, "loss/hidden": 3.4375, "loss/jsd": 0.0, "loss/logits": 0.24700318276882172, "step": 3344 }, { "epoch": 0.209125, "grad_norm": 2.890625, "grad_norm_var": 0.32160542805989584, "learning_rate": 0.0001, "loss": 7.9277, "loss/crossentropy": 2.220456123352051, "loss/hidden": 3.2421875, "loss/jsd": 0.0, "loss/logits": 0.2789622098207474, "step": 3346 }, { "epoch": 0.20925, "grad_norm": 2.6875, "grad_norm_var": 0.32161356608072916, "learning_rate": 0.0001, "loss": 8.0183, "loss/crossentropy": 2.1002278327941895, "loss/hidden": 3.1796875, "loss/jsd": 0.0, "loss/logits": 0.24935103952884674, "step": 3348 }, { "epoch": 0.209375, "grad_norm": 2.890625, "grad_norm_var": 0.3168131510416667, "learning_rate": 0.0001, "loss": 7.9776, "loss/crossentropy": 2.16874760389328, "loss/hidden": 3.2734375, "loss/jsd": 0.0, "loss/logits": 0.2457059770822525, "step": 3350 }, { "epoch": 0.2095, "grad_norm": 3.265625, "grad_norm_var": 0.3226552327473958, "learning_rate": 0.0001, "loss": 8.2816, "loss/crossentropy": 2.571222424507141, "loss/hidden": 3.359375, "loss/jsd": 0.0, "loss/logits": 0.28565920889377594, "step": 3352 }, { "epoch": 0.209625, "grad_norm": 3.0, "grad_norm_var": 0.2913736979166667, "learning_rate": 0.0001, "loss": 7.9096, "loss/crossentropy": 2.2421743869781494, "loss/hidden": 3.2734375, "loss/jsd": 0.0, "loss/logits": 0.25503817200660706, "step": 3354 }, { "epoch": 0.20975, "grad_norm": 3.125, "grad_norm_var": 0.2854563395182292, "learning_rate": 0.0001, "loss": 8.1574, "loss/crossentropy": 2.3894670009613037, "loss/hidden": 3.2109375, "loss/jsd": 0.0, "loss/logits": 0.24552837759256363, "step": 3356 }, { "epoch": 0.209875, "grad_norm": 3.015625, "grad_norm_var": 0.27294820149739585, "learning_rate": 0.0001, "loss": 7.9998, "loss/crossentropy": 2.24248206615448, "loss/hidden": 3.2421875, "loss/jsd": 0.0, "loss/logits": 0.2353578880429268, "step": 3358 }, { "epoch": 0.21, "grad_norm": 3.078125, "grad_norm_var": 0.03362223307291667, "learning_rate": 0.0001, "loss": 7.8671, "loss/crossentropy": 1.8926679491996765, "loss/hidden": 3.2734375, "loss/jsd": 0.0, "loss/logits": 0.22951842844486237, "step": 3360 }, { "epoch": 0.210125, "grad_norm": 3.203125, "grad_norm_var": 0.03689778645833333, "learning_rate": 0.0001, "loss": 8.3227, "loss/crossentropy": 2.385164499282837, "loss/hidden": 3.3125, "loss/jsd": 0.0, "loss/logits": 0.26319050788879395, "step": 3362 }, { "epoch": 0.21025, "grad_norm": 2.71875, "grad_norm_var": 0.0366607666015625, "learning_rate": 0.0001, "loss": 7.6621, "loss/crossentropy": 2.0928040742874146, "loss/hidden": 3.1953125, "loss/jsd": 0.0, "loss/logits": 0.24942705780267715, "step": 3364 }, { "epoch": 0.210375, "grad_norm": 2.90625, "grad_norm_var": 0.0343414306640625, "learning_rate": 0.0001, "loss": 7.849, "loss/crossentropy": 2.344526767730713, "loss/hidden": 3.328125, "loss/jsd": 0.0, "loss/logits": 0.2413758859038353, "step": 3366 }, { "epoch": 0.2105, "grad_norm": 2.953125, "grad_norm_var": 0.0228179931640625, "learning_rate": 0.0001, "loss": 8.0673, "loss/crossentropy": 2.1447715759277344, "loss/hidden": 3.34375, "loss/jsd": 0.0, "loss/logits": 0.2705959975719452, "step": 3368 }, { "epoch": 0.210625, "grad_norm": 2.8125, "grad_norm_var": 0.0254547119140625, "learning_rate": 0.0001, "loss": 7.9141, "loss/crossentropy": 2.4462766647338867, "loss/hidden": 3.34375, "loss/jsd": 0.0, "loss/logits": 0.2481309026479721, "step": 3370 }, { "epoch": 0.21075, "grad_norm": 2.90625, "grad_norm_var": 0.025804646809895835, "learning_rate": 0.0001, "loss": 8.0984, "loss/crossentropy": 2.186795651912689, "loss/hidden": 3.2421875, "loss/jsd": 0.0, "loss/logits": 0.2742079943418503, "step": 3372 }, { "epoch": 0.210875, "grad_norm": 2.984375, "grad_norm_var": 0.024137369791666665, "learning_rate": 0.0001, "loss": 7.8166, "loss/crossentropy": 2.2239402532577515, "loss/hidden": 3.2265625, "loss/jsd": 0.0, "loss/logits": 0.22634652256965637, "step": 3374 }, { "epoch": 0.211, "grad_norm": 3.140625, "grad_norm_var": 0.031571451822916666, "learning_rate": 0.0001, "loss": 8.1537, "loss/crossentropy": 2.196621298789978, "loss/hidden": 3.265625, "loss/jsd": 0.0, "loss/logits": 0.23910781741142273, "step": 3376 }, { "epoch": 0.211125, "grad_norm": 2.984375, "grad_norm_var": 0.02662353515625, "learning_rate": 0.0001, "loss": 8.2953, "loss/crossentropy": 2.341985583305359, "loss/hidden": 3.2578125, "loss/jsd": 0.0, "loss/logits": 0.26108188927173615, "step": 3378 }, { "epoch": 0.21125, "grad_norm": 3.046875, "grad_norm_var": 0.03253580729166667, "learning_rate": 0.0001, "loss": 8.0612, "loss/crossentropy": 1.8633220791816711, "loss/hidden": 3.171875, "loss/jsd": 0.0, "loss/logits": 0.2216172069311142, "step": 3380 }, { "epoch": 0.211375, "grad_norm": 2.9375, "grad_norm_var": 0.027171834309895834, "learning_rate": 0.0001, "loss": 7.7821, "loss/crossentropy": 2.4255632162094116, "loss/hidden": 3.1875, "loss/jsd": 0.0, "loss/logits": 0.22760914266109467, "step": 3382 }, { "epoch": 0.2115, "grad_norm": 2.921875, "grad_norm_var": 0.030110677083333332, "learning_rate": 0.0001, "loss": 7.9979, "loss/crossentropy": 2.2252612113952637, "loss/hidden": 3.2109375, "loss/jsd": 0.0, "loss/logits": 0.2598068118095398, "step": 3384 }, { "epoch": 0.211625, "grad_norm": 2.859375, "grad_norm_var": 0.028629557291666666, "learning_rate": 0.0001, "loss": 8.306, "loss/crossentropy": 2.2344504594802856, "loss/hidden": 3.21875, "loss/jsd": 0.0, "loss/logits": 0.2635873109102249, "step": 3386 }, { "epoch": 0.21175, "grad_norm": 2.734375, "grad_norm_var": 0.031656901041666664, "learning_rate": 0.0001, "loss": 8.0063, "loss/crossentropy": 2.519619584083557, "loss/hidden": 3.2734375, "loss/jsd": 0.0, "loss/logits": 0.2362290695309639, "step": 3388 }, { "epoch": 0.211875, "grad_norm": 2.953125, "grad_norm_var": 0.03181050618489583, "learning_rate": 0.0001, "loss": 7.9615, "loss/crossentropy": 2.273373603820801, "loss/hidden": 3.1953125, "loss/jsd": 0.0, "loss/logits": 0.2335738092660904, "step": 3390 }, { "epoch": 0.212, "grad_norm": 3.171875, "grad_norm_var": 0.029198201497395833, "learning_rate": 0.0001, "loss": 7.913, "loss/crossentropy": 2.204755961894989, "loss/hidden": 3.203125, "loss/jsd": 0.0, "loss/logits": 0.24834905564785004, "step": 3392 }, { "epoch": 0.212125, "grad_norm": 2.953125, "grad_norm_var": 0.029645792643229165, "learning_rate": 0.0001, "loss": 7.9813, "loss/crossentropy": 1.9195227026939392, "loss/hidden": 3.3359375, "loss/jsd": 0.0, "loss/logits": 0.2377578169107437, "step": 3394 }, { "epoch": 0.21225, "grad_norm": 2.953125, "grad_norm_var": 0.0137603759765625, "learning_rate": 0.0001, "loss": 8.0121, "loss/crossentropy": 2.3786463737487793, "loss/hidden": 3.2734375, "loss/jsd": 0.0, "loss/logits": 0.26938939839601517, "step": 3396 }, { "epoch": 0.212375, "grad_norm": 2.734375, "grad_norm_var": 0.01666259765625, "learning_rate": 0.0001, "loss": 7.9533, "loss/crossentropy": 2.2316389083862305, "loss/hidden": 3.25, "loss/jsd": 0.0, "loss/logits": 0.25020691752433777, "step": 3398 }, { "epoch": 0.2125, "grad_norm": 2.828125, "grad_norm_var": 0.0198394775390625, "learning_rate": 0.0001, "loss": 7.869, "loss/crossentropy": 1.9748644828796387, "loss/hidden": 3.2421875, "loss/jsd": 0.0, "loss/logits": 0.25132524967193604, "step": 3400 }, { "epoch": 0.212625, "grad_norm": 2.984375, "grad_norm_var": 0.02789306640625, "learning_rate": 0.0001, "loss": 7.6831, "loss/crossentropy": 2.0252427458763123, "loss/hidden": 3.203125, "loss/jsd": 0.0, "loss/logits": 0.24975833296775818, "step": 3402 }, { "epoch": 0.21275, "grad_norm": 2.953125, "grad_norm_var": 0.029618326822916666, "learning_rate": 0.0001, "loss": 7.9034, "loss/crossentropy": 2.0473382472991943, "loss/hidden": 3.25, "loss/jsd": 0.0, "loss/logits": 0.25338828563690186, "step": 3404 }, { "epoch": 0.212875, "grad_norm": 2.90625, "grad_norm_var": 0.027814737955729165, "learning_rate": 0.0001, "loss": 7.7347, "loss/crossentropy": 2.207484722137451, "loss/hidden": 3.2265625, "loss/jsd": 0.0, "loss/logits": 0.23863370716571808, "step": 3406 }, { "epoch": 0.213, "grad_norm": 2.765625, "grad_norm_var": 0.0248931884765625, "learning_rate": 0.0001, "loss": 8.195, "loss/crossentropy": 2.170145094394684, "loss/hidden": 3.2578125, "loss/jsd": 0.0, "loss/logits": 0.29602935910224915, "step": 3408 }, { "epoch": 0.213125, "grad_norm": 3.1875, "grad_norm_var": 0.0295318603515625, "learning_rate": 0.0001, "loss": 8.0324, "loss/crossentropy": 2.1131831407546997, "loss/hidden": 3.1875, "loss/jsd": 0.0, "loss/logits": 0.24959762394428253, "step": 3410 }, { "epoch": 0.21325, "grad_norm": 2.53125, "grad_norm_var": 0.03762613932291667, "learning_rate": 0.0001, "loss": 7.7096, "loss/crossentropy": 2.288270950317383, "loss/hidden": 3.2734375, "loss/jsd": 0.0, "loss/logits": 0.2603284567594528, "step": 3412 }, { "epoch": 0.213375, "grad_norm": 2.828125, "grad_norm_var": 0.03478190104166667, "learning_rate": 0.0001, "loss": 7.8065, "loss/crossentropy": 2.287190794944763, "loss/hidden": 3.21875, "loss/jsd": 0.0, "loss/logits": 0.2407102733850479, "step": 3414 }, { "epoch": 0.2135, "grad_norm": 3.796875, "grad_norm_var": 0.8093821207682291, "learning_rate": 0.0001, "loss": 7.7591, "loss/crossentropy": 1.9977330565452576, "loss/hidden": 3.1953125, "loss/jsd": 0.0, "loss/logits": 0.19386405497789383, "step": 3416 }, { "epoch": 0.213625, "grad_norm": 2.84375, "grad_norm_var": 0.79761962890625, "learning_rate": 0.0001, "loss": 7.8384, "loss/crossentropy": 2.2082529067993164, "loss/hidden": 3.234375, "loss/jsd": 0.0, "loss/logits": 0.24173694849014282, "step": 3418 }, { "epoch": 0.21375, "grad_norm": 2.828125, "grad_norm_var": 0.7936686197916667, "learning_rate": 0.0001, "loss": 7.8532, "loss/crossentropy": 2.339335799217224, "loss/hidden": 3.1796875, "loss/jsd": 0.0, "loss/logits": 0.24014320969581604, "step": 3420 }, { "epoch": 0.213875, "grad_norm": 3.0625, "grad_norm_var": 0.7882771809895833, "learning_rate": 0.0001, "loss": 8.3896, "loss/crossentropy": 2.4154088497161865, "loss/hidden": 3.3046875, "loss/jsd": 0.0, "loss/logits": 0.2824305593967438, "step": 3422 }, { "epoch": 0.214, "grad_norm": 2.78125, "grad_norm_var": 0.7893218994140625, "learning_rate": 0.0001, "loss": 7.825, "loss/crossentropy": 2.2216320037841797, "loss/hidden": 3.2578125, "loss/jsd": 0.0, "loss/logits": 0.24955695122480392, "step": 3424 }, { "epoch": 0.214125, "grad_norm": 2.71875, "grad_norm_var": 0.8019694010416667, "learning_rate": 0.0001, "loss": 7.8788, "loss/crossentropy": 1.939819097518921, "loss/hidden": 3.4375, "loss/jsd": 0.0, "loss/logits": 0.22390282154083252, "step": 3426 }, { "epoch": 0.21425, "grad_norm": 2.78125, "grad_norm_var": 0.7844553629557292, "learning_rate": 0.0001, "loss": 7.8952, "loss/crossentropy": 2.2235050201416016, "loss/hidden": 3.2578125, "loss/jsd": 0.0, "loss/logits": 0.26434002816677094, "step": 3428 }, { "epoch": 0.214375, "grad_norm": 2.671875, "grad_norm_var": 0.7889394124348958, "learning_rate": 0.0001, "loss": 7.8387, "loss/crossentropy": 2.3295034170150757, "loss/hidden": 3.1875, "loss/jsd": 0.0, "loss/logits": 0.23527751863002777, "step": 3430 }, { "epoch": 0.2145, "grad_norm": 3.046875, "grad_norm_var": 0.03748372395833333, "learning_rate": 0.0001, "loss": 8.3574, "loss/crossentropy": 2.3561675548553467, "loss/hidden": 3.3359375, "loss/jsd": 0.0, "loss/logits": 0.37270253896713257, "step": 3432 }, { "epoch": 0.214625, "grad_norm": 3.09375, "grad_norm_var": 0.037919108072916666, "learning_rate": 0.0001, "loss": 7.8119, "loss/crossentropy": 2.0585622787475586, "loss/hidden": 3.265625, "loss/jsd": 0.0, "loss/logits": 0.2616444379091263, "step": 3434 }, { "epoch": 0.21475, "grad_norm": 3.046875, "grad_norm_var": 0.04185791015625, "learning_rate": 0.0001, "loss": 7.8782, "loss/crossentropy": 2.367709159851074, "loss/hidden": 3.265625, "loss/jsd": 0.0, "loss/logits": 0.2585858106613159, "step": 3436 }, { "epoch": 0.214875, "grad_norm": 2.765625, "grad_norm_var": 0.025886027018229167, "learning_rate": 0.0001, "loss": 7.8544, "loss/crossentropy": 2.2442758083343506, "loss/hidden": 3.15625, "loss/jsd": 0.0, "loss/logits": 0.24075668305158615, "step": 3438 }, { "epoch": 0.215, "grad_norm": 3.015625, "grad_norm_var": 0.025944010416666666, "learning_rate": 0.0001, "loss": 8.0828, "loss/crossentropy": 2.1123217344284058, "loss/hidden": 3.15625, "loss/jsd": 0.0, "loss/logits": 0.25368595123291016, "step": 3440 }, { "epoch": 0.215125, "grad_norm": 2.734375, "grad_norm_var": 0.025536092122395833, "learning_rate": 0.0001, "loss": 7.9401, "loss/crossentropy": 1.9603172540664673, "loss/hidden": 3.2109375, "loss/jsd": 0.0, "loss/logits": 0.23971882462501526, "step": 3442 }, { "epoch": 0.21525, "grad_norm": 2.78125, "grad_norm_var": 0.022175089518229166, "learning_rate": 0.0001, "loss": 8.1695, "loss/crossentropy": 2.120830774307251, "loss/hidden": 3.3671875, "loss/jsd": 0.0, "loss/logits": 0.25150182843208313, "step": 3444 }, { "epoch": 0.215375, "grad_norm": 2.75, "grad_norm_var": 0.020084635416666666, "learning_rate": 0.0001, "loss": 7.777, "loss/crossentropy": 2.046776592731476, "loss/hidden": 3.1875, "loss/jsd": 0.0, "loss/logits": 0.22750262916088104, "step": 3446 }, { "epoch": 0.2155, "grad_norm": 3.171875, "grad_norm_var": 0.023824055989583332, "learning_rate": 0.0001, "loss": 7.9647, "loss/crossentropy": 2.5669057369232178, "loss/hidden": 3.1796875, "loss/jsd": 0.0, "loss/logits": 0.23348582535982132, "step": 3448 }, { "epoch": 0.215625, "grad_norm": 3.125, "grad_norm_var": 0.025487263997395832, "learning_rate": 0.0001, "loss": 8.2591, "loss/crossentropy": 2.400046944618225, "loss/hidden": 3.25, "loss/jsd": 0.0, "loss/logits": 0.26721225678920746, "step": 3450 }, { "epoch": 0.21575, "grad_norm": 3.109375, "grad_norm_var": 0.020905558268229166, "learning_rate": 0.0001, "loss": 8.3871, "loss/crossentropy": 2.2039917707443237, "loss/hidden": 3.546875, "loss/jsd": 0.0, "loss/logits": 0.33292847871780396, "step": 3452 }, { "epoch": 0.215875, "grad_norm": 2.890625, "grad_norm_var": 0.019331868489583334, "learning_rate": 0.0001, "loss": 8.0029, "loss/crossentropy": 2.433542251586914, "loss/hidden": 3.3046875, "loss/jsd": 0.0, "loss/logits": 0.26467375457286835, "step": 3454 }, { "epoch": 0.216, "grad_norm": 3.09375, "grad_norm_var": 0.019820149739583334, "learning_rate": 0.0001, "loss": 8.114, "loss/crossentropy": 2.0972259640693665, "loss/hidden": 3.5, "loss/jsd": 0.0, "loss/logits": 0.264427974820137, "step": 3456 }, { "epoch": 0.216125, "grad_norm": 2.9375, "grad_norm_var": 0.015672810872395835, "learning_rate": 0.0001, "loss": 8.1114, "loss/crossentropy": 2.1838728189468384, "loss/hidden": 3.234375, "loss/jsd": 0.0, "loss/logits": 0.24506963789463043, "step": 3458 }, { "epoch": 0.21625, "grad_norm": 2.828125, "grad_norm_var": 0.014501953125, "learning_rate": 0.0001, "loss": 7.8676, "loss/crossentropy": 2.3133649826049805, "loss/hidden": 3.1953125, "loss/jsd": 0.0, "loss/logits": 0.2419988438487053, "step": 3460 }, { "epoch": 0.216375, "grad_norm": 3.015625, "grad_norm_var": 0.01802978515625, "learning_rate": 0.0001, "loss": 8.0063, "loss/crossentropy": 2.047803819179535, "loss/hidden": 3.234375, "loss/jsd": 0.0, "loss/logits": 0.24520711600780487, "step": 3462 }, { "epoch": 0.2165, "grad_norm": 3.0625, "grad_norm_var": 0.016206868489583335, "learning_rate": 0.0001, "loss": 7.8141, "loss/crossentropy": 2.121231496334076, "loss/hidden": 3.2421875, "loss/jsd": 0.0, "loss/logits": 0.2616460770368576, "step": 3464 }, { "epoch": 0.216625, "grad_norm": 3.03125, "grad_norm_var": 0.0185943603515625, "learning_rate": 0.0001, "loss": 8.1526, "loss/crossentropy": 2.279896855354309, "loss/hidden": 3.2890625, "loss/jsd": 0.0, "loss/logits": 0.27515122294425964, "step": 3466 }, { "epoch": 0.21675, "grad_norm": 2.703125, "grad_norm_var": 0.02138671875, "learning_rate": 0.0001, "loss": 7.7578, "loss/crossentropy": 2.414362072944641, "loss/hidden": 3.234375, "loss/jsd": 0.0, "loss/logits": 0.23437850922346115, "step": 3468 }, { "epoch": 0.216875, "grad_norm": 3.375, "grad_norm_var": 0.0339508056640625, "learning_rate": 0.0001, "loss": 7.9469, "loss/crossentropy": 2.2880258560180664, "loss/hidden": 3.2109375, "loss/jsd": 0.0, "loss/logits": 0.2520551159977913, "step": 3470 }, { "epoch": 0.217, "grad_norm": 2.796875, "grad_norm_var": 0.0338287353515625, "learning_rate": 0.0001, "loss": 8.0239, "loss/crossentropy": 2.324075937271118, "loss/hidden": 3.234375, "loss/jsd": 0.0, "loss/logits": 0.2632023096084595, "step": 3472 }, { "epoch": 0.217125, "grad_norm": 2.9375, "grad_norm_var": 0.041585286458333336, "learning_rate": 0.0001, "loss": 7.6075, "loss/crossentropy": 1.9836289286613464, "loss/hidden": 3.2265625, "loss/jsd": 0.0, "loss/logits": 0.22227664291858673, "step": 3474 }, { "epoch": 0.21725, "grad_norm": 2.90625, "grad_norm_var": 0.041112263997395836, "learning_rate": 0.0001, "loss": 7.7307, "loss/crossentropy": 1.9572631120681763, "loss/hidden": 3.1796875, "loss/jsd": 0.0, "loss/logits": 0.23236718773841858, "step": 3476 }, { "epoch": 0.217375, "grad_norm": 3.09375, "grad_norm_var": 0.045832316080729164, "learning_rate": 0.0001, "loss": 7.7549, "loss/crossentropy": 2.4078683853149414, "loss/hidden": 3.1875, "loss/jsd": 0.0, "loss/logits": 0.2550060302019119, "step": 3478 }, { "epoch": 0.2175, "grad_norm": 2.5625, "grad_norm_var": 0.051667277018229166, "learning_rate": 0.0001, "loss": 7.7732, "loss/crossentropy": 2.074913501739502, "loss/hidden": 3.1875, "loss/jsd": 0.0, "loss/logits": 0.2501368671655655, "step": 3480 }, { "epoch": 0.217625, "grad_norm": 2.90625, "grad_norm_var": 0.04248758951822917, "learning_rate": 0.0001, "loss": 7.9332, "loss/crossentropy": 2.203160047531128, "loss/hidden": 3.265625, "loss/jsd": 0.0, "loss/logits": 0.2590165138244629, "step": 3482 }, { "epoch": 0.21775, "grad_norm": 2.71875, "grad_norm_var": 0.04394124348958333, "learning_rate": 0.0001, "loss": 7.7392, "loss/crossentropy": 2.1751861572265625, "loss/hidden": 3.171875, "loss/jsd": 0.0, "loss/logits": 0.24029675871133804, "step": 3484 }, { "epoch": 0.217875, "grad_norm": 2.828125, "grad_norm_var": 0.026073201497395834, "learning_rate": 0.0001, "loss": 7.9125, "loss/crossentropy": 2.310207962989807, "loss/hidden": 3.171875, "loss/jsd": 0.0, "loss/logits": 0.2397160306572914, "step": 3486 }, { "epoch": 0.218, "grad_norm": 2.75, "grad_norm_var": 0.028034464518229166, "learning_rate": 0.0001, "loss": 8.0012, "loss/crossentropy": 2.400876522064209, "loss/hidden": 3.296875, "loss/jsd": 0.0, "loss/logits": 0.25733601301908493, "step": 3488 }, { "epoch": 0.218125, "grad_norm": 2.796875, "grad_norm_var": 0.024950154622395835, "learning_rate": 0.0001, "loss": 7.8326, "loss/crossentropy": 1.9682669043540955, "loss/hidden": 3.171875, "loss/jsd": 0.0, "loss/logits": 0.2510061264038086, "step": 3490 }, { "epoch": 0.21825, "grad_norm": 2.859375, "grad_norm_var": 0.027342732747395834, "learning_rate": 0.0001, "loss": 7.6535, "loss/crossentropy": 2.4174903631210327, "loss/hidden": 3.140625, "loss/jsd": 0.0, "loss/logits": 0.23475679010152817, "step": 3492 }, { "epoch": 0.218375, "grad_norm": 2.96875, "grad_norm_var": 0.0249664306640625, "learning_rate": 0.0001, "loss": 8.2081, "loss/crossentropy": 2.036670744419098, "loss/hidden": 3.1796875, "loss/jsd": 0.0, "loss/logits": 0.2724669277667999, "step": 3494 }, { "epoch": 0.2185, "grad_norm": 2.890625, "grad_norm_var": 0.0192779541015625, "learning_rate": 0.0001, "loss": 7.8407, "loss/crossentropy": 2.4288254976272583, "loss/hidden": 3.21875, "loss/jsd": 0.0, "loss/logits": 0.2707410156726837, "step": 3496 }, { "epoch": 0.218625, "grad_norm": 2.6875, "grad_norm_var": 0.021256510416666666, "learning_rate": 0.0001, "loss": 7.9754, "loss/crossentropy": 1.9026321172714233, "loss/hidden": 3.1953125, "loss/jsd": 0.0, "loss/logits": 0.22054974734783173, "step": 3498 }, { "epoch": 0.21875, "grad_norm": 3.015625, "grad_norm_var": 0.019559733072916665, "learning_rate": 0.0001, "loss": 7.9397, "loss/crossentropy": 2.229159712791443, "loss/hidden": 3.234375, "loss/jsd": 0.0, "loss/logits": 0.24547549337148666, "step": 3500 }, { "epoch": 0.218875, "grad_norm": 2.890625, "grad_norm_var": 0.02135009765625, "learning_rate": 0.0001, "loss": 7.9211, "loss/crossentropy": 2.131072163581848, "loss/hidden": 3.21875, "loss/jsd": 0.0, "loss/logits": 0.2316766083240509, "step": 3502 }, { "epoch": 0.219, "grad_norm": 2.828125, "grad_norm_var": 0.019527180989583334, "learning_rate": 0.0001, "loss": 7.9834, "loss/crossentropy": 2.373594045639038, "loss/hidden": 3.265625, "loss/jsd": 0.0, "loss/logits": 0.26772065460681915, "step": 3504 }, { "epoch": 0.219125, "grad_norm": 3.015625, "grad_norm_var": 0.01871337890625, "learning_rate": 0.0001, "loss": 8.0302, "loss/crossentropy": 2.212057948112488, "loss/hidden": 3.1875, "loss/jsd": 0.0, "loss/logits": 0.23775693774223328, "step": 3506 }, { "epoch": 0.21925, "grad_norm": 2.71875, "grad_norm_var": 0.0160552978515625, "learning_rate": 0.0001, "loss": 7.9471, "loss/crossentropy": 2.4221761226654053, "loss/hidden": 3.21875, "loss/jsd": 0.0, "loss/logits": 0.22958316653966904, "step": 3508 }, { "epoch": 0.219375, "grad_norm": 3.03125, "grad_norm_var": 0.012165323893229166, "learning_rate": 0.0001, "loss": 8.0583, "loss/crossentropy": 2.3343610763549805, "loss/hidden": 3.2890625, "loss/jsd": 0.0, "loss/logits": 0.26760047674179077, "step": 3510 }, { "epoch": 0.2195, "grad_norm": 2.78125, "grad_norm_var": 0.0125, "learning_rate": 0.0001, "loss": 8.0273, "loss/crossentropy": 2.1401559114456177, "loss/hidden": 3.21875, "loss/jsd": 0.0, "loss/logits": 0.23956479877233505, "step": 3512 }, { "epoch": 0.219625, "grad_norm": 3.0, "grad_norm_var": 0.010807291666666666, "learning_rate": 0.0001, "loss": 7.7942, "loss/crossentropy": 2.2062328457832336, "loss/hidden": 3.140625, "loss/jsd": 0.0, "loss/logits": 0.23117581009864807, "step": 3514 }, { "epoch": 0.21975, "grad_norm": 2.671875, "grad_norm_var": 0.013206990559895833, "learning_rate": 0.0001, "loss": 7.6302, "loss/crossentropy": 2.128235101699829, "loss/hidden": 3.1953125, "loss/jsd": 0.0, "loss/logits": 0.23166847974061966, "step": 3516 }, { "epoch": 0.219875, "grad_norm": 2.5625, "grad_norm_var": 0.017606608072916665, "learning_rate": 0.0001, "loss": 7.984, "loss/crossentropy": 2.2151215076446533, "loss/hidden": 3.1953125, "loss/jsd": 0.0, "loss/logits": 0.2790476083755493, "step": 3518 }, { "epoch": 0.22, "grad_norm": 3.03125, "grad_norm_var": 0.020963541666666665, "learning_rate": 0.0001, "loss": 7.8469, "loss/crossentropy": 2.0416505932807922, "loss/hidden": 3.2109375, "loss/jsd": 0.0, "loss/logits": 0.23504749685525894, "step": 3520 }, { "epoch": 0.220125, "grad_norm": 2.59375, "grad_norm_var": 0.024983723958333332, "learning_rate": 0.0001, "loss": 7.8135, "loss/crossentropy": 2.001868188381195, "loss/hidden": 3.1953125, "loss/jsd": 0.0, "loss/logits": 0.2242003232240677, "step": 3522 }, { "epoch": 0.22025, "grad_norm": 3.015625, "grad_norm_var": 0.046773274739583336, "learning_rate": 0.0001, "loss": 8.1101, "loss/crossentropy": 2.162585973739624, "loss/hidden": 3.234375, "loss/jsd": 0.0, "loss/logits": 0.23777949810028076, "step": 3524 }, { "epoch": 0.220375, "grad_norm": 3.765625, "grad_norm_var": 0.09345296223958334, "learning_rate": 0.0001, "loss": 7.8369, "loss/crossentropy": 2.3255455493927, "loss/hidden": 3.171875, "loss/jsd": 0.0, "loss/logits": 0.25327935814857483, "step": 3526 }, { "epoch": 0.2205, "grad_norm": 2.8125, "grad_norm_var": 0.09275716145833333, "learning_rate": 0.0001, "loss": 7.8281, "loss/crossentropy": 2.231058359146118, "loss/hidden": 3.203125, "loss/jsd": 0.0, "loss/logits": 0.23266196250915527, "step": 3528 }, { "epoch": 0.220625, "grad_norm": 3.09375, "grad_norm_var": 0.09185791015625, "learning_rate": 0.0001, "loss": 7.9563, "loss/crossentropy": 2.0914143919944763, "loss/hidden": 3.2890625, "loss/jsd": 0.0, "loss/logits": 0.272638276219368, "step": 3530 }, { "epoch": 0.22075, "grad_norm": 3.046875, "grad_norm_var": 0.08422749837239583, "learning_rate": 0.0001, "loss": 7.8811, "loss/crossentropy": 2.2898030281066895, "loss/hidden": 3.3125, "loss/jsd": 0.0, "loss/logits": 0.25423404574394226, "step": 3532 }, { "epoch": 0.220875, "grad_norm": 3.734375, "grad_norm_var": 0.09142252604166666, "learning_rate": 0.0001, "loss": 7.9159, "loss/crossentropy": 2.14033579826355, "loss/hidden": 3.328125, "loss/jsd": 0.0, "loss/logits": 0.295635849237442, "step": 3534 }, { "epoch": 0.221, "grad_norm": 3.046875, "grad_norm_var": 0.09573160807291667, "learning_rate": 0.0001, "loss": 8.1543, "loss/crossentropy": 2.2987574338912964, "loss/hidden": 3.265625, "loss/jsd": 0.0, "loss/logits": 0.29592613875865936, "step": 3536 }, { "epoch": 0.221125, "grad_norm": 2.921875, "grad_norm_var": 0.08795572916666666, "learning_rate": 0.0001, "loss": 7.9903, "loss/crossentropy": 2.4567188024520874, "loss/hidden": 3.21875, "loss/jsd": 0.0, "loss/logits": 0.24662955850362778, "step": 3538 }, { "epoch": 0.22125, "grad_norm": 2.796875, "grad_norm_var": 0.12490946451822917, "learning_rate": 0.0001, "loss": 7.9305, "loss/crossentropy": 2.3008846044540405, "loss/hidden": 3.3046875, "loss/jsd": 0.0, "loss/logits": 0.27361243963241577, "step": 3540 }, { "epoch": 0.221375, "grad_norm": 2.65625, "grad_norm_var": 0.1101470947265625, "learning_rate": 0.0001, "loss": 7.872, "loss/crossentropy": 2.522549033164978, "loss/hidden": 3.2109375, "loss/jsd": 0.0, "loss/logits": 0.2551089823246002, "step": 3542 }, { "epoch": 0.2215, "grad_norm": 3.359375, "grad_norm_var": 0.1140777587890625, "learning_rate": 0.0001, "loss": 8.0564, "loss/crossentropy": 2.022073805332184, "loss/hidden": 3.1875, "loss/jsd": 0.0, "loss/logits": 0.2375979870557785, "step": 3544 }, { "epoch": 0.221625, "grad_norm": 2.546875, "grad_norm_var": 0.13304036458333332, "learning_rate": 0.0001, "loss": 7.6601, "loss/crossentropy": 2.1095418334007263, "loss/hidden": 3.2109375, "loss/jsd": 0.0, "loss/logits": 0.21154537796974182, "step": 3546 }, { "epoch": 0.22175, "grad_norm": 2.953125, "grad_norm_var": 0.13742574055989584, "learning_rate": 0.0001, "loss": 7.8642, "loss/crossentropy": 2.3745577335357666, "loss/hidden": 3.2265625, "loss/jsd": 0.0, "loss/logits": 0.24417676031589508, "step": 3548 }, { "epoch": 0.221875, "grad_norm": 2.703125, "grad_norm_var": 0.09863179524739583, "learning_rate": 0.0001, "loss": 7.9062, "loss/crossentropy": 2.2617377042770386, "loss/hidden": 3.2890625, "loss/jsd": 0.0, "loss/logits": 0.2600301057100296, "step": 3550 }, { "epoch": 0.222, "grad_norm": 2.921875, "grad_norm_var": 0.16064351399739582, "learning_rate": 0.0001, "loss": 7.8102, "loss/crossentropy": 2.2537072896957397, "loss/hidden": 3.1171875, "loss/jsd": 0.0, "loss/logits": 0.22177107632160187, "step": 3552 }, { "epoch": 0.222125, "grad_norm": 2.84375, "grad_norm_var": 0.15888264973958333, "learning_rate": 0.0001, "loss": 7.7759, "loss/crossentropy": 2.1810015439987183, "loss/hidden": 3.34375, "loss/jsd": 0.0, "loss/logits": 0.2299894094467163, "step": 3554 }, { "epoch": 0.22225, "grad_norm": 2.984375, "grad_norm_var": 0.10131734212239583, "learning_rate": 0.0001, "loss": 7.9914, "loss/crossentropy": 2.1778082847595215, "loss/hidden": 3.265625, "loss/jsd": 0.0, "loss/logits": 0.2700965404510498, "step": 3556 }, { "epoch": 0.222375, "grad_norm": 2.8125, "grad_norm_var": 0.0957427978515625, "learning_rate": 0.0001, "loss": 8.0165, "loss/crossentropy": 2.384767770767212, "loss/hidden": 3.296875, "loss/jsd": 0.0, "loss/logits": 0.2815344035625458, "step": 3558 }, { "epoch": 0.2225, "grad_norm": 2.859375, "grad_norm_var": 0.08655598958333334, "learning_rate": 0.0001, "loss": 8.0481, "loss/crossentropy": 2.166767954826355, "loss/hidden": 3.234375, "loss/jsd": 0.0, "loss/logits": 0.2488117441534996, "step": 3560 }, { "epoch": 0.222625, "grad_norm": 3.21875, "grad_norm_var": 0.07814839680989584, "learning_rate": 0.0001, "loss": 8.0776, "loss/crossentropy": 2.278902769088745, "loss/hidden": 3.1875, "loss/jsd": 0.0, "loss/logits": 0.2430027276277542, "step": 3562 }, { "epoch": 0.22275, "grad_norm": 2.828125, "grad_norm_var": 0.0761871337890625, "learning_rate": 0.0001, "loss": 7.962, "loss/crossentropy": 2.3891888856887817, "loss/hidden": 3.2578125, "loss/jsd": 0.0, "loss/logits": 0.2750861644744873, "step": 3564 }, { "epoch": 0.222875, "grad_norm": 2.890625, "grad_norm_var": 0.07242838541666667, "learning_rate": 0.0001, "loss": 7.916, "loss/crossentropy": 2.3488543033599854, "loss/hidden": 3.1875, "loss/jsd": 0.0, "loss/logits": 0.2557990550994873, "step": 3566 }, { "epoch": 0.223, "grad_norm": 2.78125, "grad_norm_var": 0.014713541666666666, "learning_rate": 0.0001, "loss": 8.13, "loss/crossentropy": 2.264247179031372, "loss/hidden": 3.234375, "loss/jsd": 0.0, "loss/logits": 0.2558897137641907, "step": 3568 }, { "epoch": 0.223125, "grad_norm": 2.875, "grad_norm_var": 0.014176432291666667, "learning_rate": 0.0001, "loss": 8.1117, "loss/crossentropy": 2.273573875427246, "loss/hidden": 3.28125, "loss/jsd": 0.0, "loss/logits": 0.28941208124160767, "step": 3570 }, { "epoch": 0.22325, "grad_norm": 2.8125, "grad_norm_var": 0.018529256184895832, "learning_rate": 0.0001, "loss": 7.7914, "loss/crossentropy": 2.2895262241363525, "loss/hidden": 3.171875, "loss/jsd": 0.0, "loss/logits": 0.22844338417053223, "step": 3572 }, { "epoch": 0.223375, "grad_norm": 3.046875, "grad_norm_var": 0.020344034830729166, "learning_rate": 0.0001, "loss": 7.8944, "loss/crossentropy": 2.2501152753829956, "loss/hidden": 3.203125, "loss/jsd": 0.0, "loss/logits": 0.2558812201023102, "step": 3574 }, { "epoch": 0.2235, "grad_norm": 2.796875, "grad_norm_var": 0.019124348958333332, "learning_rate": 0.0001, "loss": 7.7574, "loss/crossentropy": 2.137208878993988, "loss/hidden": 3.203125, "loss/jsd": 0.0, "loss/logits": 0.22356164455413818, "step": 3576 }, { "epoch": 0.223625, "grad_norm": 2.875, "grad_norm_var": 0.011747233072916667, "learning_rate": 0.0001, "loss": 8.0103, "loss/crossentropy": 2.3239294290542603, "loss/hidden": 3.1875, "loss/jsd": 0.0, "loss/logits": 0.24761919677257538, "step": 3578 }, { "epoch": 0.22375, "grad_norm": 3.171875, "grad_norm_var": 0.028563435872395834, "learning_rate": 0.0001, "loss": 8.1776, "loss/crossentropy": 2.2571107149124146, "loss/hidden": 3.21875, "loss/jsd": 0.0, "loss/logits": 0.24413985013961792, "step": 3580 }, { "epoch": 0.223875, "grad_norm": 2.765625, "grad_norm_var": 0.0296875, "learning_rate": 0.0001, "loss": 7.9054, "loss/crossentropy": 2.376944899559021, "loss/hidden": 3.21875, "loss/jsd": 0.0, "loss/logits": 0.25766023993492126, "step": 3582 }, { "epoch": 0.224, "grad_norm": 3.09375, "grad_norm_var": 0.0364898681640625, "learning_rate": 0.0001, "loss": 7.9002, "loss/crossentropy": 2.48104989528656, "loss/hidden": 3.25, "loss/jsd": 0.0, "loss/logits": 0.2584661394357681, "step": 3584 }, { "epoch": 0.224125, "grad_norm": 3.109375, "grad_norm_var": 0.03720703125, "learning_rate": 0.0001, "loss": 7.9624, "loss/crossentropy": 2.4197115898132324, "loss/hidden": 3.203125, "loss/jsd": 0.0, "loss/logits": 0.2563299685716629, "step": 3586 }, { "epoch": 0.22425, "grad_norm": 3.5625, "grad_norm_var": 0.0471832275390625, "learning_rate": 0.0001, "loss": 8.1088, "loss/crossentropy": 2.2639349699020386, "loss/hidden": 3.2578125, "loss/jsd": 0.0, "loss/logits": 0.27292385697364807, "step": 3588 }, { "epoch": 0.224375, "grad_norm": 3.25, "grad_norm_var": 0.08664957682291667, "learning_rate": 0.0001, "loss": 7.9256, "loss/crossentropy": 2.542632579803467, "loss/hidden": 3.3359375, "loss/jsd": 0.0, "loss/logits": 0.27380509674549103, "step": 3590 }, { "epoch": 0.2245, "grad_norm": 3.046875, "grad_norm_var": 0.0756500244140625, "learning_rate": 0.0001, "loss": 7.9534, "loss/crossentropy": 2.0638725757598877, "loss/hidden": 3.234375, "loss/jsd": 0.0, "loss/logits": 0.2678438723087311, "step": 3592 }, { "epoch": 0.224625, "grad_norm": 2.734375, "grad_norm_var": 0.0768707275390625, "learning_rate": 0.0001, "loss": 7.9669, "loss/crossentropy": 2.204024314880371, "loss/hidden": 3.21875, "loss/jsd": 0.0, "loss/logits": 0.27217431366443634, "step": 3594 }, { "epoch": 0.22475, "grad_norm": 3.09375, "grad_norm_var": 0.07506510416666666, "learning_rate": 0.0001, "loss": 7.9581, "loss/crossentropy": 2.1799510717391968, "loss/hidden": 3.2890625, "loss/jsd": 0.0, "loss/logits": 0.2552003934979439, "step": 3596 }, { "epoch": 0.224875, "grad_norm": 2.53125, "grad_norm_var": 0.08918863932291667, "learning_rate": 0.0001, "loss": 7.9478, "loss/crossentropy": 2.350053310394287, "loss/hidden": 3.2421875, "loss/jsd": 0.0, "loss/logits": 0.2713748663663864, "step": 3598 }, { "epoch": 0.225, "grad_norm": 2.84375, "grad_norm_var": 0.09192301432291666, "learning_rate": 0.0001, "loss": 8.0373, "loss/crossentropy": 2.5646886825561523, "loss/hidden": 3.265625, "loss/jsd": 0.0, "loss/logits": 0.252247117459774, "step": 3600 }, { "epoch": 0.225125, "grad_norm": 2.75, "grad_norm_var": 0.1020904541015625, "learning_rate": 0.0001, "loss": 7.7359, "loss/crossentropy": 2.331313371658325, "loss/hidden": 3.1796875, "loss/jsd": 0.0, "loss/logits": 0.24355339258909225, "step": 3602 }, { "epoch": 0.22525, "grad_norm": 3.03125, "grad_norm_var": 0.09511311848958333, "learning_rate": 0.0001, "loss": 7.557, "loss/crossentropy": 2.099987030029297, "loss/hidden": 3.2109375, "loss/jsd": 0.0, "loss/logits": 0.2239885851740837, "step": 3604 }, { "epoch": 0.225375, "grad_norm": 3.109375, "grad_norm_var": 0.04091695149739583, "learning_rate": 0.0001, "loss": 8.0773, "loss/crossentropy": 2.166841745376587, "loss/hidden": 3.2265625, "loss/jsd": 0.0, "loss/logits": 0.2539377585053444, "step": 3606 }, { "epoch": 0.2255, "grad_norm": 2.640625, "grad_norm_var": 0.04342041015625, "learning_rate": 0.0001, "loss": 7.8462, "loss/crossentropy": 2.2023102045059204, "loss/hidden": 3.15625, "loss/jsd": 0.0, "loss/logits": 0.2344854474067688, "step": 3608 }, { "epoch": 0.225625, "grad_norm": 3.671875, "grad_norm_var": 0.07916259765625, "learning_rate": 0.0001, "loss": 8.127, "loss/crossentropy": 2.5000699758529663, "loss/hidden": 3.1953125, "loss/jsd": 0.0, "loss/logits": 0.2570355013012886, "step": 3610 }, { "epoch": 0.22575, "grad_norm": 2.84375, "grad_norm_var": 0.0775390625, "learning_rate": 0.0001, "loss": 7.9756, "loss/crossentropy": 2.0878546237945557, "loss/hidden": 3.1953125, "loss/jsd": 0.0, "loss/logits": 0.23281921446323395, "step": 3612 }, { "epoch": 0.225875, "grad_norm": 2.734375, "grad_norm_var": 0.064892578125, "learning_rate": 0.0001, "loss": 7.7364, "loss/crossentropy": 2.080467939376831, "loss/hidden": 3.1328125, "loss/jsd": 0.0, "loss/logits": 0.2381865382194519, "step": 3614 }, { "epoch": 0.226, "grad_norm": 2.75, "grad_norm_var": 0.0649566650390625, "learning_rate": 0.0001, "loss": 7.9112, "loss/crossentropy": 2.0848032236099243, "loss/hidden": 3.2265625, "loss/jsd": 0.0, "loss/logits": 0.24008891731500626, "step": 3616 }, { "epoch": 0.226125, "grad_norm": 2.890625, "grad_norm_var": 0.06313374837239584, "learning_rate": 0.0001, "loss": 8.0448, "loss/crossentropy": 2.6174436807632446, "loss/hidden": 3.234375, "loss/jsd": 0.0, "loss/logits": 0.23969924449920654, "step": 3618 }, { "epoch": 0.22625, "grad_norm": 3.5, "grad_norm_var": 0.07757059733072917, "learning_rate": 0.0001, "loss": 7.9928, "loss/crossentropy": 2.1519758701324463, "loss/hidden": 3.1875, "loss/jsd": 0.0, "loss/logits": 0.23642629384994507, "step": 3620 }, { "epoch": 0.226375, "grad_norm": 3.25, "grad_norm_var": 0.08548177083333333, "learning_rate": 0.0001, "loss": 7.8108, "loss/crossentropy": 2.118536949157715, "loss/hidden": 3.21875, "loss/jsd": 0.0, "loss/logits": 0.2450103908777237, "step": 3622 }, { "epoch": 0.2265, "grad_norm": 3.109375, "grad_norm_var": 0.08001302083333334, "learning_rate": 0.0001, "loss": 8.1292, "loss/crossentropy": 2.1092581748962402, "loss/hidden": 3.21875, "loss/jsd": 0.0, "loss/logits": 0.24913781136274338, "step": 3624 }, { "epoch": 0.226625, "grad_norm": 3.25, "grad_norm_var": 0.05774739583333333, "learning_rate": 0.0001, "loss": 7.7808, "loss/crossentropy": 2.242541193962097, "loss/hidden": 3.1875, "loss/jsd": 0.0, "loss/logits": 0.236085943877697, "step": 3626 }, { "epoch": 0.22675, "grad_norm": 3.015625, "grad_norm_var": 0.056966145833333336, "learning_rate": 0.0001, "loss": 7.9217, "loss/crossentropy": 2.288723349571228, "loss/hidden": 3.1796875, "loss/jsd": 0.0, "loss/logits": 0.26326847821474075, "step": 3628 }, { "epoch": 0.226875, "grad_norm": 2.984375, "grad_norm_var": 0.04830322265625, "learning_rate": 0.0001, "loss": 8.0331, "loss/crossentropy": 2.0977364778518677, "loss/hidden": 3.2109375, "loss/jsd": 0.0, "loss/logits": 0.25680290162563324, "step": 3630 }, { "epoch": 0.227, "grad_norm": 2.96875, "grad_norm_var": 0.046219889322916666, "learning_rate": 0.0001, "loss": 8.081, "loss/crossentropy": 2.281399726867676, "loss/hidden": 3.2421875, "loss/jsd": 0.0, "loss/logits": 0.2612576484680176, "step": 3632 }, { "epoch": 0.227125, "grad_norm": 3.265625, "grad_norm_var": 0.046442667643229164, "learning_rate": 0.0001, "loss": 7.8859, "loss/crossentropy": 2.1967058181762695, "loss/hidden": 3.265625, "loss/jsd": 0.0, "loss/logits": 0.2603886350989342, "step": 3634 }, { "epoch": 0.22725, "grad_norm": 2.90625, "grad_norm_var": 0.03575846354166667, "learning_rate": 0.0001, "loss": 8.2296, "loss/crossentropy": 2.211125612258911, "loss/hidden": 3.265625, "loss/jsd": 0.0, "loss/logits": 0.27175895124673843, "step": 3636 }, { "epoch": 0.227375, "grad_norm": 2.875, "grad_norm_var": 0.029596964518229168, "learning_rate": 0.0001, "loss": 7.7215, "loss/crossentropy": 2.314815878868103, "loss/hidden": 3.1953125, "loss/jsd": 0.0, "loss/logits": 0.25243079662323, "step": 3638 }, { "epoch": 0.2275, "grad_norm": 3.015625, "grad_norm_var": 0.027229817708333333, "learning_rate": 0.0001, "loss": 7.9264, "loss/crossentropy": 2.7343060970306396, "loss/hidden": 3.234375, "loss/jsd": 0.0, "loss/logits": 0.2654494494199753, "step": 3640 }, { "epoch": 0.227625, "grad_norm": 2.75, "grad_norm_var": 0.025927734375, "learning_rate": 0.0001, "loss": 7.8579, "loss/crossentropy": 2.0919711589813232, "loss/hidden": 3.234375, "loss/jsd": 0.0, "loss/logits": 0.2338973432779312, "step": 3642 }, { "epoch": 0.22775, "grad_norm": 2.9375, "grad_norm_var": 0.026610310872395834, "learning_rate": 0.0001, "loss": 8.0756, "loss/crossentropy": 2.185083746910095, "loss/hidden": 3.1953125, "loss/jsd": 0.0, "loss/logits": 0.2376946359872818, "step": 3644 }, { "epoch": 0.227875, "grad_norm": 2.890625, "grad_norm_var": 0.038117472330729166, "learning_rate": 0.0001, "loss": 7.9083, "loss/crossentropy": 2.3631292581558228, "loss/hidden": 3.296875, "loss/jsd": 0.0, "loss/logits": 0.25929248332977295, "step": 3646 }, { "epoch": 0.228, "grad_norm": 2.703125, "grad_norm_var": 0.039013671875, "learning_rate": 0.0001, "loss": 7.9839, "loss/crossentropy": 2.4326263666152954, "loss/hidden": 3.2265625, "loss/jsd": 0.0, "loss/logits": 0.24526787549257278, "step": 3648 }, { "epoch": 0.228125, "grad_norm": 3.046875, "grad_norm_var": 0.033589680989583336, "learning_rate": 0.0001, "loss": 8.008, "loss/crossentropy": 2.2084107398986816, "loss/hidden": 3.2578125, "loss/jsd": 0.0, "loss/logits": 0.2918578237295151, "step": 3650 }, { "epoch": 0.22825, "grad_norm": 2.8125, "grad_norm_var": 0.02818603515625, "learning_rate": 0.0001, "loss": 7.6168, "loss/crossentropy": 2.044155180454254, "loss/hidden": 3.1953125, "loss/jsd": 0.0, "loss/logits": 0.26035064458847046, "step": 3652 }, { "epoch": 0.228375, "grad_norm": 2.734375, "grad_norm_var": 0.0298980712890625, "learning_rate": 0.0001, "loss": 7.6004, "loss/crossentropy": 1.9853056073188782, "loss/hidden": 3.1484375, "loss/jsd": 0.0, "loss/logits": 0.2116970717906952, "step": 3654 }, { "epoch": 0.2285, "grad_norm": 2.859375, "grad_norm_var": 0.029325358072916665, "learning_rate": 0.0001, "loss": 7.9553, "loss/crossentropy": 2.3441128730773926, "loss/hidden": 3.1953125, "loss/jsd": 0.0, "loss/logits": 0.24771487712860107, "step": 3656 }, { "epoch": 0.228625, "grad_norm": 2.828125, "grad_norm_var": 0.026383463541666666, "learning_rate": 0.0001, "loss": 7.8984, "loss/crossentropy": 2.1385873556137085, "loss/hidden": 3.28125, "loss/jsd": 0.0, "loss/logits": 0.274789422750473, "step": 3658 }, { "epoch": 0.22875, "grad_norm": 2.84375, "grad_norm_var": 0.02623291015625, "learning_rate": 0.0001, "loss": 7.8789, "loss/crossentropy": 2.2766119241714478, "loss/hidden": 3.1875, "loss/jsd": 0.0, "loss/logits": 0.25663213431835175, "step": 3660 }, { "epoch": 0.228875, "grad_norm": 3.09375, "grad_norm_var": 0.0163726806640625, "learning_rate": 0.0001, "loss": 7.8729, "loss/crossentropy": 2.143479347229004, "loss/hidden": 3.2109375, "loss/jsd": 0.0, "loss/logits": 0.22755863517522812, "step": 3662 }, { "epoch": 0.229, "grad_norm": 3.390625, "grad_norm_var": 0.0333160400390625, "learning_rate": 0.0001, "loss": 7.8351, "loss/crossentropy": 2.130086302757263, "loss/hidden": 3.25, "loss/jsd": 0.0, "loss/logits": 0.25640900433063507, "step": 3664 }, { "epoch": 0.229125, "grad_norm": 2.828125, "grad_norm_var": 0.031004842122395834, "learning_rate": 0.0001, "loss": 8.0049, "loss/crossentropy": 1.9736462235450745, "loss/hidden": 3.1875, "loss/jsd": 0.0, "loss/logits": 0.22910848259925842, "step": 3666 }, { "epoch": 0.22925, "grad_norm": 2.71875, "grad_norm_var": 0.032957967122395834, "learning_rate": 0.0001, "loss": 7.7042, "loss/crossentropy": 1.9823258519172668, "loss/hidden": 3.1875, "loss/jsd": 0.0, "loss/logits": 0.24188224971294403, "step": 3668 }, { "epoch": 0.229375, "grad_norm": 2.796875, "grad_norm_var": 0.033356730143229166, "learning_rate": 0.0001, "loss": 7.873, "loss/crossentropy": 2.193943977355957, "loss/hidden": 3.28125, "loss/jsd": 0.0, "loss/logits": 0.25041549652814865, "step": 3670 }, { "epoch": 0.2295, "grad_norm": 2.75, "grad_norm_var": 0.034300740559895834, "learning_rate": 0.0001, "loss": 7.8963, "loss/crossentropy": 2.0905851125717163, "loss/hidden": 3.1796875, "loss/jsd": 0.0, "loss/logits": 0.24181322753429413, "step": 3672 }, { "epoch": 0.229625, "grad_norm": 2.953125, "grad_norm_var": 0.04078369140625, "learning_rate": 0.0001, "loss": 7.6612, "loss/crossentropy": 2.074888229370117, "loss/hidden": 3.203125, "loss/jsd": 0.0, "loss/logits": 0.24967533349990845, "step": 3674 }, { "epoch": 0.22975, "grad_norm": 3.1875, "grad_norm_var": 0.04593098958333333, "learning_rate": 0.0001, "loss": 8.182, "loss/crossentropy": 2.4388426542282104, "loss/hidden": 3.2109375, "loss/jsd": 0.0, "loss/logits": 0.2701159864664078, "step": 3676 }, { "epoch": 0.229875, "grad_norm": 2.75, "grad_norm_var": 0.04246317545572917, "learning_rate": 0.0001, "loss": 7.9407, "loss/crossentropy": 2.3165522813796997, "loss/hidden": 3.328125, "loss/jsd": 0.0, "loss/logits": 0.2641128599643707, "step": 3678 }, { "epoch": 0.23, "grad_norm": 2.9375, "grad_norm_var": 0.028075154622395834, "learning_rate": 0.0001, "loss": 7.8861, "loss/crossentropy": 2.1456260681152344, "loss/hidden": 3.28125, "loss/jsd": 0.0, "loss/logits": 0.25554676353931427, "step": 3680 }, { "epoch": 0.230125, "grad_norm": 2.640625, "grad_norm_var": 0.031966145833333334, "learning_rate": 0.0001, "loss": 7.801, "loss/crossentropy": 1.9411519765853882, "loss/hidden": 3.234375, "loss/jsd": 0.0, "loss/logits": 0.22800318896770477, "step": 3682 }, { "epoch": 0.23025, "grad_norm": 2.671875, "grad_norm_var": 0.0322906494140625, "learning_rate": 0.0001, "loss": 7.9874, "loss/crossentropy": 2.5022239685058594, "loss/hidden": 3.2578125, "loss/jsd": 0.0, "loss/logits": 0.22672583162784576, "step": 3684 }, { "epoch": 0.230375, "grad_norm": 2.71875, "grad_norm_var": 0.03371480305989583, "learning_rate": 0.0001, "loss": 8.0547, "loss/crossentropy": 2.3385313749313354, "loss/hidden": 3.2109375, "loss/jsd": 0.0, "loss/logits": 0.250342458486557, "step": 3686 }, { "epoch": 0.2305, "grad_norm": 2.96875, "grad_norm_var": 0.0328521728515625, "learning_rate": 0.0001, "loss": 7.8317, "loss/crossentropy": 2.262456774711609, "loss/hidden": 3.15625, "loss/jsd": 0.0, "loss/logits": 0.2567814439535141, "step": 3688 }, { "epoch": 0.230625, "grad_norm": 3.015625, "grad_norm_var": 0.028246053059895835, "learning_rate": 0.0001, "loss": 8.0957, "loss/crossentropy": 2.162911057472229, "loss/hidden": 3.203125, "loss/jsd": 0.0, "loss/logits": 0.2832511365413666, "step": 3690 }, { "epoch": 0.23075, "grad_norm": 2.734375, "grad_norm_var": 0.037646484375, "learning_rate": 0.0001, "loss": 7.9, "loss/crossentropy": 2.0919874906539917, "loss/hidden": 3.2421875, "loss/jsd": 0.0, "loss/logits": 0.2599126175045967, "step": 3692 }, { "epoch": 0.230875, "grad_norm": 2.765625, "grad_norm_var": 0.034544881184895834, "learning_rate": 0.0001, "loss": 8.2518, "loss/crossentropy": 2.3653175830841064, "loss/hidden": 3.1875, "loss/jsd": 0.0, "loss/logits": 0.2551597058773041, "step": 3694 }, { "epoch": 0.231, "grad_norm": 3.0625, "grad_norm_var": 0.03876546223958333, "learning_rate": 0.0001, "loss": 7.7356, "loss/crossentropy": 2.008529305458069, "loss/hidden": 3.2265625, "loss/jsd": 0.0, "loss/logits": 0.23333938419818878, "step": 3696 }, { "epoch": 0.231125, "grad_norm": 2.921875, "grad_norm_var": 0.030159505208333333, "learning_rate": 0.0001, "loss": 8.0322, "loss/crossentropy": 2.4657877683639526, "loss/hidden": 3.25, "loss/jsd": 0.0, "loss/logits": 0.25284393876791, "step": 3698 }, { "epoch": 0.23125, "grad_norm": 6.84375, "grad_norm_var": 0.9776112874348958, "learning_rate": 0.0001, "loss": 8.0716, "loss/crossentropy": 2.190954327583313, "loss/hidden": 3.1328125, "loss/jsd": 0.0, "loss/logits": 0.2405833974480629, "step": 3700 }, { "epoch": 0.231375, "grad_norm": 2.8125, "grad_norm_var": 0.9728179931640625, "learning_rate": 0.0001, "loss": 7.9139, "loss/crossentropy": 2.2382062673568726, "loss/hidden": 3.1953125, "loss/jsd": 0.0, "loss/logits": 0.2504802420735359, "step": 3702 }, { "epoch": 0.2315, "grad_norm": 3.15625, "grad_norm_var": 0.9693756103515625, "learning_rate": 0.0001, "loss": 8.1178, "loss/crossentropy": 2.4467432498931885, "loss/hidden": 3.15625, "loss/jsd": 0.0, "loss/logits": 0.23961085081100464, "step": 3704 }, { "epoch": 0.231625, "grad_norm": 3.328125, "grad_norm_var": 0.9597941080729167, "learning_rate": 0.0001, "loss": 8.1038, "loss/crossentropy": 2.1803176403045654, "loss/hidden": 3.25, "loss/jsd": 0.0, "loss/logits": 0.265427365899086, "step": 3706 }, { "epoch": 0.23175, "grad_norm": 2.984375, "grad_norm_var": 0.9465484619140625, "learning_rate": 0.0001, "loss": 8.0725, "loss/crossentropy": 2.045309603214264, "loss/hidden": 3.203125, "loss/jsd": 0.0, "loss/logits": 0.23161526769399643, "step": 3708 }, { "epoch": 0.231875, "grad_norm": 2.859375, "grad_norm_var": 0.9664296468098958, "learning_rate": 0.0001, "loss": 7.762, "loss/crossentropy": 2.1329755783081055, "loss/hidden": 3.1484375, "loss/jsd": 0.0, "loss/logits": 0.21435976773500443, "step": 3710 }, { "epoch": 0.232, "grad_norm": 2.796875, "grad_norm_var": 0.9795806884765625, "learning_rate": 0.0001, "loss": 7.8622, "loss/crossentropy": 2.332452416419983, "loss/hidden": 3.265625, "loss/jsd": 0.0, "loss/logits": 0.24902824312448502, "step": 3712 }, { "epoch": 0.232125, "grad_norm": 2.828125, "grad_norm_var": 0.98232421875, "learning_rate": 0.0001, "loss": 7.8737, "loss/crossentropy": 2.1177918314933777, "loss/hidden": 3.15625, "loss/jsd": 0.0, "loss/logits": 0.23020614683628082, "step": 3714 }, { "epoch": 0.23225, "grad_norm": 3.046875, "grad_norm_var": 0.041259765625, "learning_rate": 0.0001, "loss": 8.1088, "loss/crossentropy": 2.5457119941711426, "loss/hidden": 3.2109375, "loss/jsd": 0.0, "loss/logits": 0.24084428697824478, "step": 3716 }, { "epoch": 0.232375, "grad_norm": 3.65625, "grad_norm_var": 0.07121988932291666, "learning_rate": 0.0001, "loss": 8.1263, "loss/crossentropy": 2.2667051553726196, "loss/hidden": 3.2578125, "loss/jsd": 0.0, "loss/logits": 0.28153958916664124, "step": 3718 }, { "epoch": 0.2325, "grad_norm": 2.65625, "grad_norm_var": 0.07902730305989583, "learning_rate": 0.0001, "loss": 7.8627, "loss/crossentropy": 2.1701722145080566, "loss/hidden": 3.140625, "loss/jsd": 0.0, "loss/logits": 0.2343793362379074, "step": 3720 }, { "epoch": 0.232625, "grad_norm": 3.65625, "grad_norm_var": 0.6369954427083333, "learning_rate": 0.0001, "loss": 7.8928, "loss/crossentropy": 2.009825825691223, "loss/hidden": 3.2265625, "loss/jsd": 0.0, "loss/logits": 0.2613653987646103, "step": 3722 }, { "epoch": 0.23275, "grad_norm": 2.8125, "grad_norm_var": 0.6477203369140625, "learning_rate": 0.0001, "loss": 7.9655, "loss/crossentropy": 1.8538190722465515, "loss/hidden": 3.265625, "loss/jsd": 0.0, "loss/logits": 0.22898602485656738, "step": 3724 }, { "epoch": 0.232875, "grad_norm": 3.953125, "grad_norm_var": 0.6635080973307291, "learning_rate": 0.0001, "loss": 8.1831, "loss/crossentropy": 2.4460748434066772, "loss/hidden": 3.265625, "loss/jsd": 0.0, "loss/logits": 0.278794527053833, "step": 3726 }, { "epoch": 0.233, "grad_norm": 3.078125, "grad_norm_var": 0.647119140625, "learning_rate": 0.0001, "loss": 7.9594, "loss/crossentropy": 2.2578943967819214, "loss/hidden": 3.21875, "loss/jsd": 0.0, "loss/logits": 0.24067480117082596, "step": 3728 }, { "epoch": 0.233125, "grad_norm": 3.015625, "grad_norm_var": 0.6367146809895833, "learning_rate": 0.0001, "loss": 8.1654, "loss/crossentropy": 2.0661072731018066, "loss/hidden": 3.3046875, "loss/jsd": 0.0, "loss/logits": 0.2693081870675087, "step": 3730 }, { "epoch": 0.23325, "grad_norm": 2.8125, "grad_norm_var": 0.6377105712890625, "learning_rate": 0.0001, "loss": 7.9926, "loss/crossentropy": 2.173050284385681, "loss/hidden": 3.1953125, "loss/jsd": 0.0, "loss/logits": 0.25787366926670074, "step": 3732 }, { "epoch": 0.233375, "grad_norm": 3.234375, "grad_norm_var": 0.6337198893229167, "learning_rate": 0.0001, "loss": 8.0724, "loss/crossentropy": 2.2620134353637695, "loss/hidden": 3.1640625, "loss/jsd": 0.0, "loss/logits": 0.2431875318288803, "step": 3734 }, { "epoch": 0.2335, "grad_norm": 2.8125, "grad_norm_var": 0.6274373372395833, "learning_rate": 0.0001, "loss": 7.8573, "loss/crossentropy": 2.387823700904846, "loss/hidden": 3.1796875, "loss/jsd": 0.0, "loss/logits": 0.24485048651695251, "step": 3736 }, { "epoch": 0.233625, "grad_norm": 2.640625, "grad_norm_var": 0.09532877604166666, "learning_rate": 0.0001, "loss": 8.0269, "loss/crossentropy": 2.315932035446167, "loss/hidden": 3.2109375, "loss/jsd": 0.0, "loss/logits": 0.2529396787285805, "step": 3738 }, { "epoch": 0.23375, "grad_norm": 2.890625, "grad_norm_var": 0.09357096354166666, "learning_rate": 0.0001, "loss": 7.9224, "loss/crossentropy": 2.360776424407959, "loss/hidden": 3.2265625, "loss/jsd": 0.0, "loss/logits": 0.2630798667669296, "step": 3740 }, { "epoch": 0.233875, "grad_norm": 2.65625, "grad_norm_var": 0.030882771809895834, "learning_rate": 0.0001, "loss": 7.7428, "loss/crossentropy": 2.1258424520492554, "loss/hidden": 3.1484375, "loss/jsd": 0.0, "loss/logits": 0.2373504936695099, "step": 3742 }, { "epoch": 0.234, "grad_norm": 2.6875, "grad_norm_var": 0.027619425455729166, "learning_rate": 0.0001, "loss": 7.8262, "loss/crossentropy": 2.3645920753479004, "loss/hidden": 3.140625, "loss/jsd": 0.0, "loss/logits": 0.2592678740620613, "step": 3744 }, { "epoch": 0.234125, "grad_norm": 2.84375, "grad_norm_var": 0.026041666666666668, "learning_rate": 0.0001, "loss": 7.6269, "loss/crossentropy": 2.185905337333679, "loss/hidden": 3.234375, "loss/jsd": 0.0, "loss/logits": 0.2242574542760849, "step": 3746 }, { "epoch": 0.23425, "grad_norm": 2.828125, "grad_norm_var": 0.027392578125, "learning_rate": 0.0001, "loss": 7.9114, "loss/crossentropy": 2.2320408821105957, "loss/hidden": 3.15625, "loss/jsd": 0.0, "loss/logits": 0.2445889264345169, "step": 3748 }, { "epoch": 0.234375, "grad_norm": 3.4375, "grad_norm_var": 0.0395416259765625, "learning_rate": 0.0001, "loss": 7.9443, "loss/crossentropy": 2.1563917994499207, "loss/hidden": 3.140625, "loss/jsd": 0.0, "loss/logits": 0.24374403059482574, "step": 3750 }, { "epoch": 0.2345, "grad_norm": 2.875, "grad_norm_var": 0.039774576822916664, "learning_rate": 0.0001, "loss": 8.0078, "loss/crossentropy": 2.2550970315933228, "loss/hidden": 3.234375, "loss/jsd": 0.0, "loss/logits": 0.27107375860214233, "step": 3752 }, { "epoch": 0.234625, "grad_norm": 2.953125, "grad_norm_var": 0.03527018229166667, "learning_rate": 0.0001, "loss": 7.7008, "loss/crossentropy": 1.9745083451271057, "loss/hidden": 3.171875, "loss/jsd": 0.0, "loss/logits": 0.25497596710920334, "step": 3754 }, { "epoch": 0.23475, "grad_norm": 2.625, "grad_norm_var": 0.040160115559895834, "learning_rate": 0.0001, "loss": 7.8099, "loss/crossentropy": 2.337002158164978, "loss/hidden": 3.2890625, "loss/jsd": 0.0, "loss/logits": 0.25781358033418655, "step": 3756 }, { "epoch": 0.234875, "grad_norm": 3.5625, "grad_norm_var": 0.07073160807291666, "learning_rate": 0.0001, "loss": 7.7631, "loss/crossentropy": 2.1787675619125366, "loss/hidden": 3.1796875, "loss/jsd": 0.0, "loss/logits": 0.2274041771888733, "step": 3758 }, { "epoch": 0.235, "grad_norm": 3.046875, "grad_norm_var": 0.06876627604166667, "learning_rate": 0.0001, "loss": 7.8394, "loss/crossentropy": 2.342090129852295, "loss/hidden": 3.1875, "loss/jsd": 0.0, "loss/logits": 0.25103210657835007, "step": 3760 }, { "epoch": 0.235125, "grad_norm": 2.8125, "grad_norm_var": 0.06526692708333333, "learning_rate": 0.0001, "loss": 7.8923, "loss/crossentropy": 2.049377143383026, "loss/hidden": 3.171875, "loss/jsd": 0.0, "loss/logits": 0.23541534692049026, "step": 3762 }, { "epoch": 0.23525, "grad_norm": 3.1875, "grad_norm_var": 0.07031962076822916, "learning_rate": 0.0001, "loss": 8.096, "loss/crossentropy": 2.2713589668273926, "loss/hidden": 3.21875, "loss/jsd": 0.0, "loss/logits": 0.27829641103744507, "step": 3764 }, { "epoch": 0.235375, "grad_norm": 2.796875, "grad_norm_var": 0.05662333170572917, "learning_rate": 0.0001, "loss": 7.8781, "loss/crossentropy": 2.324322819709778, "loss/hidden": 3.25, "loss/jsd": 0.0, "loss/logits": 0.2734615206718445, "step": 3766 }, { "epoch": 0.2355, "grad_norm": 2.890625, "grad_norm_var": 0.05676676432291667, "learning_rate": 0.0001, "loss": 8.1779, "loss/crossentropy": 2.591456174850464, "loss/hidden": 3.25, "loss/jsd": 0.0, "loss/logits": 0.2322877123951912, "step": 3768 }, { "epoch": 0.235625, "grad_norm": 3.4375, "grad_norm_var": 0.07343343098958334, "learning_rate": 0.0001, "loss": 7.9672, "loss/crossentropy": 2.1592308282852173, "loss/hidden": 3.2109375, "loss/jsd": 0.0, "loss/logits": 0.27204084396362305, "step": 3770 }, { "epoch": 0.23575, "grad_norm": 2.640625, "grad_norm_var": 0.0701324462890625, "learning_rate": 0.0001, "loss": 8.0189, "loss/crossentropy": 2.0555856823921204, "loss/hidden": 3.2109375, "loss/jsd": 0.0, "loss/logits": 0.2383071631193161, "step": 3772 }, { "epoch": 0.235875, "grad_norm": 2.640625, "grad_norm_var": 0.14469401041666666, "learning_rate": 0.0001, "loss": 7.996, "loss/crossentropy": 2.4771432876586914, "loss/hidden": 3.2578125, "loss/jsd": 0.0, "loss/logits": 0.25897009670734406, "step": 3774 }, { "epoch": 0.236, "grad_norm": 2.875, "grad_norm_var": 0.14617513020833334, "learning_rate": 0.0001, "loss": 7.8766, "loss/crossentropy": 1.9889416098594666, "loss/hidden": 3.1875, "loss/jsd": 0.0, "loss/logits": 0.24898222088813782, "step": 3776 }, { "epoch": 0.236125, "grad_norm": 3.5, "grad_norm_var": 0.15862528483072916, "learning_rate": 0.0001, "loss": 7.9788, "loss/crossentropy": 2.3927156925201416, "loss/hidden": 3.2890625, "loss/jsd": 0.0, "loss/logits": 0.24354391545057297, "step": 3778 }, { "epoch": 0.23625, "grad_norm": 2.828125, "grad_norm_var": 0.15758056640625, "learning_rate": 0.0001, "loss": 7.8968, "loss/crossentropy": 2.3122246265411377, "loss/hidden": 3.3125, "loss/jsd": 0.0, "loss/logits": 0.25621072947978973, "step": 3780 }, { "epoch": 0.236375, "grad_norm": 2.890625, "grad_norm_var": 0.15104878743489583, "learning_rate": 0.0001, "loss": 8.085, "loss/crossentropy": 2.251040816307068, "loss/hidden": 3.171875, "loss/jsd": 0.0, "loss/logits": 0.2513684630393982, "step": 3782 }, { "epoch": 0.2365, "grad_norm": 3.75, "grad_norm_var": 0.18613993326822917, "learning_rate": 0.0001, "loss": 8.292, "loss/crossentropy": 2.175575017929077, "loss/hidden": 3.40625, "loss/jsd": 0.0, "loss/logits": 0.28343138098716736, "step": 3784 }, { "epoch": 0.236625, "grad_norm": 3.1875, "grad_norm_var": 0.177197265625, "learning_rate": 0.0001, "loss": 7.7969, "loss/crossentropy": 1.978526771068573, "loss/hidden": 3.2421875, "loss/jsd": 0.0, "loss/logits": 0.22661619633436203, "step": 3786 }, { "epoch": 0.23675, "grad_norm": 2.796875, "grad_norm_var": 0.17004292805989582, "learning_rate": 0.0001, "loss": 7.9381, "loss/crossentropy": 2.223105788230896, "loss/hidden": 3.1953125, "loss/jsd": 0.0, "loss/logits": 0.239663265645504, "step": 3788 }, { "epoch": 0.236875, "grad_norm": 3.34375, "grad_norm_var": 0.08036702473958333, "learning_rate": 0.0001, "loss": 8.1388, "loss/crossentropy": 2.765785574913025, "loss/hidden": 3.28125, "loss/jsd": 0.0, "loss/logits": 0.28563418984413147, "step": 3790 }, { "epoch": 0.237, "grad_norm": 3.171875, "grad_norm_var": 0.0760894775390625, "learning_rate": 0.0001, "loss": 7.8952, "loss/crossentropy": 1.8238465189933777, "loss/hidden": 3.1796875, "loss/jsd": 0.0, "loss/logits": 0.22626029700040817, "step": 3792 }, { "epoch": 0.237125, "grad_norm": 2.859375, "grad_norm_var": 0.07593485514322916, "learning_rate": 0.0001, "loss": 7.7314, "loss/crossentropy": 1.8743899464607239, "loss/hidden": 3.203125, "loss/jsd": 0.0, "loss/logits": 0.2275635451078415, "step": 3794 }, { "epoch": 0.23725, "grad_norm": 2.75, "grad_norm_var": 0.0802734375, "learning_rate": 0.0001, "loss": 7.9782, "loss/crossentropy": 2.0901085138320923, "loss/hidden": 3.21875, "loss/jsd": 0.0, "loss/logits": 0.2510174810886383, "step": 3796 }, { "epoch": 0.237375, "grad_norm": 2.90625, "grad_norm_var": 0.08465169270833334, "learning_rate": 0.0001, "loss": 7.975, "loss/crossentropy": 2.1743310689926147, "loss/hidden": 3.21875, "loss/jsd": 0.0, "loss/logits": 0.25368860363960266, "step": 3798 }, { "epoch": 0.2375, "grad_norm": 2.8125, "grad_norm_var": 0.36526285807291664, "learning_rate": 0.0001, "loss": 8.1893, "loss/crossentropy": 2.2449333667755127, "loss/hidden": 3.2578125, "loss/jsd": 0.0, "loss/logits": 0.24078501015901566, "step": 3800 }, { "epoch": 0.237625, "grad_norm": 2.6875, "grad_norm_var": 0.3767730712890625, "learning_rate": 0.0001, "loss": 7.9801, "loss/crossentropy": 2.5240964889526367, "loss/hidden": 3.2421875, "loss/jsd": 0.0, "loss/logits": 0.25918126106262207, "step": 3802 }, { "epoch": 0.23775, "grad_norm": 9.5625, "grad_norm_var": 2.9724110921223956, "learning_rate": 0.0001, "loss": 7.9687, "loss/crossentropy": 2.2271899580955505, "loss/hidden": 3.1796875, "loss/jsd": 0.0, "loss/logits": 0.24096133559942245, "step": 3804 }, { "epoch": 0.237875, "grad_norm": 2.90625, "grad_norm_var": 2.975516764322917, "learning_rate": 0.0001, "loss": 7.9853, "loss/crossentropy": 2.258672595024109, "loss/hidden": 3.234375, "loss/jsd": 0.0, "loss/logits": 0.238336443901062, "step": 3806 }, { "epoch": 0.238, "grad_norm": 2.609375, "grad_norm_var": 3.018424479166667, "learning_rate": 0.0001, "loss": 7.5261, "loss/crossentropy": 2.014474391937256, "loss/hidden": 3.140625, "loss/jsd": 0.0, "loss/logits": 0.22659683227539062, "step": 3808 }, { "epoch": 0.238125, "grad_norm": 2.75, "grad_norm_var": 3.0586334228515626, "learning_rate": 0.0001, "loss": 7.936, "loss/crossentropy": 2.326842784881592, "loss/hidden": 3.203125, "loss/jsd": 0.0, "loss/logits": 0.2781771123409271, "step": 3810 }, { "epoch": 0.23825, "grad_norm": 2.78125, "grad_norm_var": 3.049762980143229, "learning_rate": 0.0001, "loss": 7.9421, "loss/crossentropy": 2.150639057159424, "loss/hidden": 3.171875, "loss/jsd": 0.0, "loss/logits": 0.2547147274017334, "step": 3812 }, { "epoch": 0.238375, "grad_norm": 3.046875, "grad_norm_var": 3.0191365559895833, "learning_rate": 0.0001, "loss": 8.1857, "loss/crossentropy": 2.305691123008728, "loss/hidden": 3.2421875, "loss/jsd": 0.0, "loss/logits": 0.2698453515768051, "step": 3814 }, { "epoch": 0.2385, "grad_norm": 2.703125, "grad_norm_var": 2.818309529622396, "learning_rate": 0.0001, "loss": 7.8953, "loss/crossentropy": 2.2862741947174072, "loss/hidden": 3.171875, "loss/jsd": 0.0, "loss/logits": 0.25044675171375275, "step": 3816 }, { "epoch": 0.238625, "grad_norm": 2.703125, "grad_norm_var": 2.83492431640625, "learning_rate": 0.0001, "loss": 7.5931, "loss/crossentropy": 2.0508742332458496, "loss/hidden": 3.1953125, "loss/jsd": 0.0, "loss/logits": 0.2025298923254013, "step": 3818 }, { "epoch": 0.23875, "grad_norm": 2.9375, "grad_norm_var": 0.04814351399739583, "learning_rate": 0.0001, "loss": 8.0295, "loss/crossentropy": 2.624704599380493, "loss/hidden": 3.15625, "loss/jsd": 0.0, "loss/logits": 0.22461646050214767, "step": 3820 }, { "epoch": 0.238875, "grad_norm": 2.8125, "grad_norm_var": 0.026595052083333334, "learning_rate": 0.0001, "loss": 7.8637, "loss/crossentropy": 2.2096092104911804, "loss/hidden": 3.1640625, "loss/jsd": 0.0, "loss/logits": 0.24073514342308044, "step": 3822 }, { "epoch": 0.239, "grad_norm": 2.859375, "grad_norm_var": 0.02271728515625, "learning_rate": 0.0001, "loss": 7.993, "loss/crossentropy": 2.146873116493225, "loss/hidden": 3.2578125, "loss/jsd": 0.0, "loss/logits": 0.2401731088757515, "step": 3824 }, { "epoch": 0.239125, "grad_norm": 2.6875, "grad_norm_var": 0.0234283447265625, "learning_rate": 0.0001, "loss": 7.8214, "loss/crossentropy": 2.140552520751953, "loss/hidden": 3.171875, "loss/jsd": 0.0, "loss/logits": 0.23471351712942123, "step": 3826 }, { "epoch": 0.23925, "grad_norm": 2.953125, "grad_norm_var": 0.023942057291666666, "learning_rate": 0.0001, "loss": 7.9997, "loss/crossentropy": 2.414936065673828, "loss/hidden": 3.2421875, "loss/jsd": 0.0, "loss/logits": 0.24187783896923065, "step": 3828 }, { "epoch": 0.239375, "grad_norm": 3.09375, "grad_norm_var": 0.025129191080729165, "learning_rate": 0.0001, "loss": 8.0156, "loss/crossentropy": 2.357997417449951, "loss/hidden": 3.28125, "loss/jsd": 0.0, "loss/logits": 0.283820241689682, "step": 3830 }, { "epoch": 0.2395, "grad_norm": 3.25, "grad_norm_var": 0.03043212890625, "learning_rate": 0.0001, "loss": 8.2948, "loss/crossentropy": 2.3154983520507812, "loss/hidden": 3.234375, "loss/jsd": 0.0, "loss/logits": 0.26576242595911026, "step": 3832 }, { "epoch": 0.239625, "grad_norm": 3.015625, "grad_norm_var": 0.024274698893229165, "learning_rate": 0.0001, "loss": 8.0395, "loss/crossentropy": 2.24621844291687, "loss/hidden": 3.21875, "loss/jsd": 0.0, "loss/logits": 0.25777043402194977, "step": 3834 }, { "epoch": 0.23975, "grad_norm": 2.796875, "grad_norm_var": 0.023631795247395834, "learning_rate": 0.0001, "loss": 7.7948, "loss/crossentropy": 2.0940075516700745, "loss/hidden": 3.171875, "loss/jsd": 0.0, "loss/logits": 0.2591153308749199, "step": 3836 }, { "epoch": 0.239875, "grad_norm": 3.125, "grad_norm_var": 0.025658162434895833, "learning_rate": 0.0001, "loss": 7.951, "loss/crossentropy": 1.96131432056427, "loss/hidden": 3.2421875, "loss/jsd": 0.0, "loss/logits": 0.24312470853328705, "step": 3838 }, { "epoch": 0.24, "grad_norm": 2.8125, "grad_norm_var": 0.026341756184895832, "learning_rate": 0.0001, "loss": 8.1305, "loss/crossentropy": 2.140554904937744, "loss/hidden": 3.203125, "loss/jsd": 0.0, "loss/logits": 0.27448374032974243, "step": 3840 }, { "epoch": 0.240125, "grad_norm": 2.828125, "grad_norm_var": 0.021996053059895833, "learning_rate": 0.0001, "loss": 7.7789, "loss/crossentropy": 2.332738757133484, "loss/hidden": 3.1796875, "loss/jsd": 0.0, "loss/logits": 0.239657923579216, "step": 3842 }, { "epoch": 0.24025, "grad_norm": 2.875, "grad_norm_var": 0.021549479166666666, "learning_rate": 0.0001, "loss": 8.0133, "loss/crossentropy": 2.0687233209609985, "loss/hidden": 3.2265625, "loss/jsd": 0.0, "loss/logits": 0.24270765483379364, "step": 3844 }, { "epoch": 0.240375, "grad_norm": 3.046875, "grad_norm_var": 0.0222320556640625, "learning_rate": 0.0001, "loss": 8.1137, "loss/crossentropy": 2.4447437524795532, "loss/hidden": 3.1796875, "loss/jsd": 0.0, "loss/logits": 0.24282505363225937, "step": 3846 }, { "epoch": 0.2405, "grad_norm": 3.125, "grad_norm_var": 0.022021484375, "learning_rate": 0.0001, "loss": 7.7143, "loss/crossentropy": 2.2180095911026, "loss/hidden": 3.1484375, "loss/jsd": 0.0, "loss/logits": 0.24302168190479279, "step": 3848 }, { "epoch": 0.240625, "grad_norm": 3.0, "grad_norm_var": 0.026219685872395832, "learning_rate": 0.0001, "loss": 7.9856, "loss/crossentropy": 2.161308765411377, "loss/hidden": 3.265625, "loss/jsd": 0.0, "loss/logits": 0.3072284013032913, "step": 3850 }, { "epoch": 0.24075, "grad_norm": 2.859375, "grad_norm_var": 0.0242340087890625, "learning_rate": 0.0001, "loss": 7.9395, "loss/crossentropy": 2.06203693151474, "loss/hidden": 3.15625, "loss/jsd": 0.0, "loss/logits": 0.22315899282693863, "step": 3852 }, { "epoch": 0.240875, "grad_norm": 3.03125, "grad_norm_var": 0.021996053059895833, "learning_rate": 0.0001, "loss": 8.1296, "loss/crossentropy": 2.379991888999939, "loss/hidden": 3.1953125, "loss/jsd": 0.0, "loss/logits": 0.2533378601074219, "step": 3854 }, { "epoch": 0.241, "grad_norm": 3.625, "grad_norm_var": 0.0501129150390625, "learning_rate": 0.0001, "loss": 7.9913, "loss/crossentropy": 2.316717028617859, "loss/hidden": 3.2109375, "loss/jsd": 0.0, "loss/logits": 0.2669248729944229, "step": 3856 }, { "epoch": 0.241125, "grad_norm": 2.890625, "grad_norm_var": 0.04895426432291667, "learning_rate": 0.0001, "loss": 7.8078, "loss/crossentropy": 2.4782038927078247, "loss/hidden": 3.2421875, "loss/jsd": 0.0, "loss/logits": 0.26327964663505554, "step": 3858 }, { "epoch": 0.24125, "grad_norm": 2.84375, "grad_norm_var": 0.05172526041666667, "learning_rate": 0.0001, "loss": 7.8827, "loss/crossentropy": 2.1494674682617188, "loss/hidden": 3.1640625, "loss/jsd": 0.0, "loss/logits": 0.24628841876983643, "step": 3860 }, { "epoch": 0.241375, "grad_norm": 2.828125, "grad_norm_var": 0.055573527018229166, "learning_rate": 0.0001, "loss": 7.8361, "loss/crossentropy": 2.1763776540756226, "loss/hidden": 3.15625, "loss/jsd": 0.0, "loss/logits": 0.23965707421302795, "step": 3862 }, { "epoch": 0.2415, "grad_norm": 2.78125, "grad_norm_var": 0.07244364420572917, "learning_rate": 0.0001, "loss": 7.9438, "loss/crossentropy": 2.0851109623908997, "loss/hidden": 3.1875, "loss/jsd": 0.0, "loss/logits": 0.23258942365646362, "step": 3864 }, { "epoch": 0.241625, "grad_norm": 3.09375, "grad_norm_var": 0.071630859375, "learning_rate": 0.0001, "loss": 8.1806, "loss/crossentropy": 2.501349687576294, "loss/hidden": 3.3046875, "loss/jsd": 0.0, "loss/logits": 0.2682061791419983, "step": 3866 }, { "epoch": 0.24175, "grad_norm": 2.828125, "grad_norm_var": 0.07084859212239583, "learning_rate": 0.0001, "loss": 7.7917, "loss/crossentropy": 2.0787243247032166, "loss/hidden": 3.1640625, "loss/jsd": 0.0, "loss/logits": 0.23858734220266342, "step": 3868 }, { "epoch": 0.241875, "grad_norm": 2.875, "grad_norm_var": 0.0725738525390625, "learning_rate": 0.0001, "loss": 8.058, "loss/crossentropy": 2.2865673303604126, "loss/hidden": 3.28125, "loss/jsd": 0.0, "loss/logits": 0.2657891660928726, "step": 3870 }, { "epoch": 0.242, "grad_norm": 2.9375, "grad_norm_var": 0.046187337239583334, "learning_rate": 0.0001, "loss": 7.7398, "loss/crossentropy": 2.2405368089675903, "loss/hidden": 3.109375, "loss/jsd": 0.0, "loss/logits": 0.21967384219169617, "step": 3872 }, { "epoch": 0.242125, "grad_norm": 2.65625, "grad_norm_var": 0.048949178059895834, "learning_rate": 0.0001, "loss": 7.8247, "loss/crossentropy": 2.404029607772827, "loss/hidden": 3.25, "loss/jsd": 0.0, "loss/logits": 0.2577320486307144, "step": 3874 }, { "epoch": 0.24225, "grad_norm": 2.796875, "grad_norm_var": 0.050145467122395836, "learning_rate": 0.0001, "loss": 7.7622, "loss/crossentropy": 2.000898540019989, "loss/hidden": 3.2109375, "loss/jsd": 0.0, "loss/logits": 0.22360271960496902, "step": 3876 }, { "epoch": 0.242375, "grad_norm": 2.6875, "grad_norm_var": 0.0509918212890625, "learning_rate": 0.0001, "loss": 8.0134, "loss/crossentropy": 2.0290806889533997, "loss/hidden": 3.1796875, "loss/jsd": 0.0, "loss/logits": 0.2340826392173767, "step": 3878 }, { "epoch": 0.2425, "grad_norm": 3.296875, "grad_norm_var": 0.038736979166666664, "learning_rate": 0.0001, "loss": 7.5927, "loss/crossentropy": 2.03622567653656, "loss/hidden": 3.1953125, "loss/jsd": 0.0, "loss/logits": 0.2297421544790268, "step": 3880 }, { "epoch": 0.242625, "grad_norm": 2.953125, "grad_norm_var": 0.035302734375, "learning_rate": 0.0001, "loss": 7.6075, "loss/crossentropy": 2.408776044845581, "loss/hidden": 3.1640625, "loss/jsd": 0.0, "loss/logits": 0.2384292036294937, "step": 3882 }, { "epoch": 0.24275, "grad_norm": 2.8125, "grad_norm_var": 0.035374959309895836, "learning_rate": 0.0001, "loss": 8.1039, "loss/crossentropy": 2.3733338117599487, "loss/hidden": 3.2265625, "loss/jsd": 0.0, "loss/logits": 0.2562837600708008, "step": 3884 }, { "epoch": 0.242875, "grad_norm": 3.34375, "grad_norm_var": 0.07336832682291666, "learning_rate": 0.0001, "loss": 8.1929, "loss/crossentropy": 2.340599298477173, "loss/hidden": 3.15625, "loss/jsd": 0.0, "loss/logits": 0.24032187461853027, "step": 3886 }, { "epoch": 0.243, "grad_norm": 2.515625, "grad_norm_var": 0.07639567057291667, "learning_rate": 0.0001, "loss": 7.785, "loss/crossentropy": 2.0766428112983704, "loss/hidden": 3.203125, "loss/jsd": 0.0, "loss/logits": 0.24173461645841599, "step": 3888 }, { "epoch": 0.243125, "grad_norm": 2.65625, "grad_norm_var": 0.07659505208333334, "learning_rate": 0.0001, "loss": 7.7176, "loss/crossentropy": 2.0216459035873413, "loss/hidden": 3.1875, "loss/jsd": 0.0, "loss/logits": 0.23061519861221313, "step": 3890 }, { "epoch": 0.24325, "grad_norm": 3.078125, "grad_norm_var": 0.07595113118489584, "learning_rate": 0.0001, "loss": 7.9577, "loss/crossentropy": 2.4382340908050537, "loss/hidden": 3.2109375, "loss/jsd": 0.0, "loss/logits": 0.2533516585826874, "step": 3892 }, { "epoch": 0.243375, "grad_norm": 3.140625, "grad_norm_var": 0.07418212890625, "learning_rate": 0.0001, "loss": 8.0997, "loss/crossentropy": 2.247236132621765, "loss/hidden": 3.140625, "loss/jsd": 0.0, "loss/logits": 0.24737747013568878, "step": 3894 }, { "epoch": 0.2435, "grad_norm": 3.296875, "grad_norm_var": 0.0683502197265625, "learning_rate": 0.0001, "loss": 8.051, "loss/crossentropy": 2.2353633642196655, "loss/hidden": 3.1328125, "loss/jsd": 0.0, "loss/logits": 0.2348858192563057, "step": 3896 }, { "epoch": 0.243625, "grad_norm": 2.765625, "grad_norm_var": 0.07394917805989583, "learning_rate": 0.0001, "loss": 7.9866, "loss/crossentropy": 2.2014153003692627, "loss/hidden": 3.2578125, "loss/jsd": 0.0, "loss/logits": 0.2826214134693146, "step": 3898 }, { "epoch": 0.24375, "grad_norm": 2.890625, "grad_norm_var": 0.07604878743489583, "learning_rate": 0.0001, "loss": 7.9507, "loss/crossentropy": 2.4739164113998413, "loss/hidden": 3.21875, "loss/jsd": 0.0, "loss/logits": 0.25748641788959503, "step": 3900 }, { "epoch": 0.243875, "grad_norm": 2.640625, "grad_norm_var": 0.04755859375, "learning_rate": 0.0001, "loss": 7.8795, "loss/crossentropy": 2.1323670148849487, "loss/hidden": 3.15625, "loss/jsd": 0.0, "loss/logits": 0.25650689005851746, "step": 3902 }, { "epoch": 0.244, "grad_norm": 2.734375, "grad_norm_var": 0.043701171875, "learning_rate": 0.0001, "loss": 7.8378, "loss/crossentropy": 2.3712302446365356, "loss/hidden": 3.1796875, "loss/jsd": 0.0, "loss/logits": 0.2511042654514313, "step": 3904 }, { "epoch": 0.244125, "grad_norm": 2.984375, "grad_norm_var": 0.09410807291666666, "learning_rate": 0.0001, "loss": 8.3216, "loss/crossentropy": 2.4047751426696777, "loss/hidden": 3.2265625, "loss/jsd": 0.0, "loss/logits": 0.2454528957605362, "step": 3906 }, { "epoch": 0.24425, "grad_norm": 2.875, "grad_norm_var": 0.09372456868489583, "learning_rate": 0.0001, "loss": 7.8043, "loss/crossentropy": 2.0897953510284424, "loss/hidden": 3.265625, "loss/jsd": 0.0, "loss/logits": 0.23917565494775772, "step": 3908 }, { "epoch": 0.244375, "grad_norm": 2.84375, "grad_norm_var": 0.09542643229166667, "learning_rate": 0.0001, "loss": 7.8158, "loss/crossentropy": 2.3932164907455444, "loss/hidden": 3.1640625, "loss/jsd": 0.0, "loss/logits": 0.24167190492153168, "step": 3910 }, { "epoch": 0.2445, "grad_norm": 2.6875, "grad_norm_var": 0.09932352701822916, "learning_rate": 0.0001, "loss": 8.0192, "loss/crossentropy": 2.150011718273163, "loss/hidden": 3.171875, "loss/jsd": 0.0, "loss/logits": 0.23026306927204132, "step": 3912 }, { "epoch": 0.244625, "grad_norm": 2.8125, "grad_norm_var": 0.10123291015625, "learning_rate": 0.0001, "loss": 8.0848, "loss/crossentropy": 2.2929705381393433, "loss/hidden": 3.1953125, "loss/jsd": 0.0, "loss/logits": 0.24725070595741272, "step": 3914 }, { "epoch": 0.24475, "grad_norm": 2.78125, "grad_norm_var": 0.1007476806640625, "learning_rate": 0.0001, "loss": 8.1725, "loss/crossentropy": 2.7538572549819946, "loss/hidden": 3.234375, "loss/jsd": 0.0, "loss/logits": 0.26145463436841965, "step": 3916 }, { "epoch": 0.244875, "grad_norm": 2.828125, "grad_norm_var": 0.0963531494140625, "learning_rate": 0.0001, "loss": 8.1, "loss/crossentropy": 2.3769789934158325, "loss/hidden": 3.140625, "loss/jsd": 0.0, "loss/logits": 0.24588000029325485, "step": 3918 }, { "epoch": 0.245, "grad_norm": 2.828125, "grad_norm_var": 0.09029032389322916, "learning_rate": 0.0001, "loss": 7.6699, "loss/crossentropy": 1.9251909255981445, "loss/hidden": 3.2109375, "loss/jsd": 0.0, "loss/logits": 0.25139550119638443, "step": 3920 }, { "epoch": 0.245125, "grad_norm": 2.609375, "grad_norm_var": 0.03618062337239583, "learning_rate": 0.0001, "loss": 7.8213, "loss/crossentropy": 2.192555010318756, "loss/hidden": 3.1796875, "loss/jsd": 0.0, "loss/logits": 0.25434186309576035, "step": 3922 }, { "epoch": 0.24525, "grad_norm": 2.734375, "grad_norm_var": 0.03730061848958333, "learning_rate": 0.0001, "loss": 7.7314, "loss/crossentropy": 2.180566668510437, "loss/hidden": 3.15625, "loss/jsd": 0.0, "loss/logits": 0.2401362881064415, "step": 3924 }, { "epoch": 0.245375, "grad_norm": 2.6875, "grad_norm_var": 0.04153238932291667, "learning_rate": 0.0001, "loss": 7.7933, "loss/crossentropy": 2.3428491353988647, "loss/hidden": 3.171875, "loss/jsd": 0.0, "loss/logits": 0.22811010479927063, "step": 3926 }, { "epoch": 0.2455, "grad_norm": 2.953125, "grad_norm_var": 0.3216461181640625, "learning_rate": 0.0001, "loss": 8.0985, "loss/crossentropy": 2.136742353439331, "loss/hidden": 3.1796875, "loss/jsd": 0.0, "loss/logits": 0.27370651066303253, "step": 3928 }, { "epoch": 0.245625, "grad_norm": 6.09375, "grad_norm_var": 6.679329427083333, "learning_rate": 0.0001, "loss": 8.0496, "loss/crossentropy": 2.194254517555237, "loss/hidden": 3.21875, "loss/jsd": 0.0, "loss/logits": 0.2502598837018013, "step": 3930 }, { "epoch": 0.24575, "grad_norm": 2.71875, "grad_norm_var": 6.666551717122396, "learning_rate": 0.0001, "loss": 7.7119, "loss/crossentropy": 2.229232430458069, "loss/hidden": 3.171875, "loss/jsd": 0.0, "loss/logits": 0.22939639538526535, "step": 3932 }, { "epoch": 0.245875, "grad_norm": 3.109375, "grad_norm_var": 6.66904296875, "learning_rate": 0.0001, "loss": 7.7796, "loss/crossentropy": 2.297219753265381, "loss/hidden": 3.1953125, "loss/jsd": 0.0, "loss/logits": 0.23499147593975067, "step": 3934 }, { "epoch": 0.246, "grad_norm": 3.046875, "grad_norm_var": 6.655793253580729, "learning_rate": 0.0001, "loss": 7.5568, "loss/crossentropy": 1.9980588555335999, "loss/hidden": 3.2109375, "loss/jsd": 0.0, "loss/logits": 0.20121245831251144, "step": 3936 }, { "epoch": 0.246125, "grad_norm": 3.21875, "grad_norm_var": 6.571418253580729, "learning_rate": 0.0001, "loss": 7.8851, "loss/crossentropy": 2.2334850430488586, "loss/hidden": 3.1796875, "loss/jsd": 0.0, "loss/logits": 0.24031591415405273, "step": 3938 }, { "epoch": 0.24625, "grad_norm": 2.640625, "grad_norm_var": 6.60816650390625, "learning_rate": 0.0001, "loss": 7.8051, "loss/crossentropy": 2.151924788951874, "loss/hidden": 3.171875, "loss/jsd": 0.0, "loss/logits": 0.24223484098911285, "step": 3940 }, { "epoch": 0.246375, "grad_norm": 2.953125, "grad_norm_var": 6.49404296875, "learning_rate": 0.0001, "loss": 7.9787, "loss/crossentropy": 1.9760611057281494, "loss/hidden": 3.296875, "loss/jsd": 0.0, "loss/logits": 0.2687162607908249, "step": 3942 }, { "epoch": 0.2465, "grad_norm": 3.375, "grad_norm_var": 6.415360514322916, "learning_rate": 0.0001, "loss": 7.8625, "loss/crossentropy": 2.1624463200569153, "loss/hidden": 3.1953125, "loss/jsd": 0.0, "loss/logits": 0.2566223293542862, "step": 3944 }, { "epoch": 0.246625, "grad_norm": 2.6875, "grad_norm_var": 0.05237223307291667, "learning_rate": 0.0001, "loss": 7.8567, "loss/crossentropy": 2.005824327468872, "loss/hidden": 3.1484375, "loss/jsd": 0.0, "loss/logits": 0.22724048048257828, "step": 3946 }, { "epoch": 0.24675, "grad_norm": 3.125, "grad_norm_var": 0.05159403483072917, "learning_rate": 0.0001, "loss": 8.3117, "loss/crossentropy": 2.373980760574341, "loss/hidden": 3.1640625, "loss/jsd": 0.0, "loss/logits": 0.22651521116495132, "step": 3948 }, { "epoch": 0.246875, "grad_norm": 2.921875, "grad_norm_var": 0.06736551920572917, "learning_rate": 0.0001, "loss": 7.8482, "loss/crossentropy": 2.2012929916381836, "loss/hidden": 3.1875, "loss/jsd": 0.0, "loss/logits": 0.25912199914455414, "step": 3950 }, { "epoch": 0.247, "grad_norm": 2.734375, "grad_norm_var": 0.0662750244140625, "learning_rate": 0.0001, "loss": 7.9075, "loss/crossentropy": 2.2046212553977966, "loss/hidden": 3.2109375, "loss/jsd": 0.0, "loss/logits": 0.2533003240823746, "step": 3952 }, { "epoch": 0.247125, "grad_norm": 2.71875, "grad_norm_var": 0.07158101399739583, "learning_rate": 0.0001, "loss": 7.9504, "loss/crossentropy": 2.2848947048187256, "loss/hidden": 3.171875, "loss/jsd": 0.0, "loss/logits": 0.24908190220594406, "step": 3954 }, { "epoch": 0.24725, "grad_norm": 2.90625, "grad_norm_var": 0.06037495930989583, "learning_rate": 0.0001, "loss": 7.9475, "loss/crossentropy": 2.413972496986389, "loss/hidden": 3.1875, "loss/jsd": 0.0, "loss/logits": 0.2672061175107956, "step": 3956 }, { "epoch": 0.247375, "grad_norm": 3.046875, "grad_norm_var": 0.0574859619140625, "learning_rate": 0.0001, "loss": 8.1567, "loss/crossentropy": 2.096158504486084, "loss/hidden": 3.1875, "loss/jsd": 0.0, "loss/logits": 0.2615414559841156, "step": 3958 }, { "epoch": 0.2475, "grad_norm": 2.859375, "grad_norm_var": 0.048421223958333336, "learning_rate": 0.0001, "loss": 7.7317, "loss/crossentropy": 2.0660911202430725, "loss/hidden": 3.15625, "loss/jsd": 0.0, "loss/logits": 0.2472071498632431, "step": 3960 }, { "epoch": 0.247625, "grad_norm": 2.875, "grad_norm_var": 0.04641927083333333, "learning_rate": 0.0001, "loss": 7.8344, "loss/crossentropy": 2.0953343510627747, "loss/hidden": 3.234375, "loss/jsd": 0.0, "loss/logits": 0.23046907782554626, "step": 3962 }, { "epoch": 0.24775, "grad_norm": 2.59375, "grad_norm_var": 0.049609375, "learning_rate": 0.0001, "loss": 7.7008, "loss/crossentropy": 1.82452791929245, "loss/hidden": 3.1953125, "loss/jsd": 0.0, "loss/logits": 0.24055629968643188, "step": 3964 }, { "epoch": 0.247875, "grad_norm": 3.09375, "grad_norm_var": 0.024735514322916666, "learning_rate": 0.0001, "loss": 7.8135, "loss/crossentropy": 2.0658547282218933, "loss/hidden": 3.21875, "loss/jsd": 0.0, "loss/logits": 0.2628151923418045, "step": 3966 }, { "epoch": 0.248, "grad_norm": 2.765625, "grad_norm_var": 0.024312337239583332, "learning_rate": 0.0001, "loss": 8.2018, "loss/crossentropy": 2.480798840522766, "loss/hidden": 3.2421875, "loss/jsd": 0.0, "loss/logits": 0.2662227302789688, "step": 3968 }, { "epoch": 0.248125, "grad_norm": 3.1875, "grad_norm_var": 0.027197265625, "learning_rate": 0.0001, "loss": 7.8781, "loss/crossentropy": 2.0047097206115723, "loss/hidden": 3.140625, "loss/jsd": 0.0, "loss/logits": 0.22677994519472122, "step": 3970 }, { "epoch": 0.24825, "grad_norm": 2.796875, "grad_norm_var": 0.028120930989583334, "learning_rate": 0.0001, "loss": 7.8491, "loss/crossentropy": 2.277346134185791, "loss/hidden": 3.2265625, "loss/jsd": 0.0, "loss/logits": 0.2579154819250107, "step": 3972 }, { "epoch": 0.248375, "grad_norm": 2.75, "grad_norm_var": 0.0278717041015625, "learning_rate": 0.0001, "loss": 7.8388, "loss/crossentropy": 2.085721015930176, "loss/hidden": 3.2265625, "loss/jsd": 0.0, "loss/logits": 0.2601402476429939, "step": 3974 }, { "epoch": 0.2485, "grad_norm": 3.1875, "grad_norm_var": 0.03560791015625, "learning_rate": 0.0001, "loss": 7.8489, "loss/crossentropy": 2.1430622339248657, "loss/hidden": 3.1796875, "loss/jsd": 0.0, "loss/logits": 0.26219113171100616, "step": 3976 }, { "epoch": 0.248625, "grad_norm": 2.75, "grad_norm_var": 0.03760477701822917, "learning_rate": 0.0001, "loss": 7.8513, "loss/crossentropy": 2.0896475315093994, "loss/hidden": 3.171875, "loss/jsd": 0.0, "loss/logits": 0.25778842717409134, "step": 3978 }, { "epoch": 0.24875, "grad_norm": 3.5, "grad_norm_var": 0.07751363118489583, "learning_rate": 0.0001, "loss": 8.0859, "loss/crossentropy": 2.225816011428833, "loss/hidden": 3.25, "loss/jsd": 0.0, "loss/logits": 0.2742011398077011, "step": 3980 }, { "epoch": 0.248875, "grad_norm": 3.5, "grad_norm_var": 0.0935943603515625, "learning_rate": 0.0001, "loss": 8.1728, "loss/crossentropy": 2.2876285314559937, "loss/hidden": 3.2578125, "loss/jsd": 0.0, "loss/logits": 0.2769860327243805, "step": 3982 }, { "epoch": 0.249, "grad_norm": 3.15625, "grad_norm_var": 0.3271555582682292, "learning_rate": 0.0001, "loss": 7.7188, "loss/crossentropy": 2.0827596187591553, "loss/hidden": 3.1875, "loss/jsd": 0.0, "loss/logits": 0.22882172465324402, "step": 3984 }, { "epoch": 0.249125, "grad_norm": 2.609375, "grad_norm_var": 0.3521484375, "learning_rate": 0.0001, "loss": 7.823, "loss/crossentropy": 1.9287652969360352, "loss/hidden": 3.1484375, "loss/jsd": 0.0, "loss/logits": 0.23859848082065582, "step": 3986 }, { "epoch": 0.24925, "grad_norm": 3.09375, "grad_norm_var": 0.35139058430989584, "learning_rate": 0.0001, "loss": 8.0426, "loss/crossentropy": 2.221052050590515, "loss/hidden": 3.203125, "loss/jsd": 0.0, "loss/logits": 0.2629384398460388, "step": 3988 }, { "epoch": 0.249375, "grad_norm": 2.796875, "grad_norm_var": 0.33687744140625, "learning_rate": 0.0001, "loss": 7.7937, "loss/crossentropy": 2.2589504718780518, "loss/hidden": 3.296875, "loss/jsd": 0.0, "loss/logits": 0.2325127273797989, "step": 3990 }, { "epoch": 0.2495, "grad_norm": 2.859375, "grad_norm_var": 0.35234375, "learning_rate": 0.0001, "loss": 7.7682, "loss/crossentropy": 2.1742732524871826, "loss/hidden": 3.171875, "loss/jsd": 0.0, "loss/logits": 0.22974377870559692, "step": 3992 }, { "epoch": 0.249625, "grad_norm": 4.375, "grad_norm_var": 0.42823893229166665, "learning_rate": 0.0001, "loss": 7.7779, "loss/crossentropy": 2.5104219913482666, "loss/hidden": 3.234375, "loss/jsd": 0.0, "loss/logits": 0.2467298060655594, "step": 3994 }, { "epoch": 0.24975, "grad_norm": 2.9375, "grad_norm_var": 0.420947265625, "learning_rate": 0.0001, "loss": 7.7312, "loss/crossentropy": 2.1976277232170105, "loss/hidden": 3.1875, "loss/jsd": 0.0, "loss/logits": 0.24208973348140717, "step": 3996 }, { "epoch": 0.249875, "grad_norm": 2.828125, "grad_norm_var": 0.418994140625, "learning_rate": 0.0001, "loss": 7.7208, "loss/crossentropy": 2.2309181094169617, "loss/hidden": 3.1875, "loss/jsd": 0.0, "loss/logits": 0.2361968457698822, "step": 3998 }, { "epoch": 0.25, "grad_norm": 2.78125, "grad_norm_var": 0.16116434733072918, "learning_rate": 0.0001, "loss": 7.5968, "loss/crossentropy": 1.8574098348617554, "loss/hidden": 3.0703125, "loss/jsd": 0.0, "loss/logits": 0.19984108209609985, "step": 4000 } ], "logging_steps": 2, "max_steps": 16000, "num_input_tokens_seen": 0, "num_train_epochs": 9223372036854775807, "save_steps": 4000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 4.16590621310976e+18, "train_batch_size": 8, "trial_name": null, "trial_params": null }