{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.2, "eval_steps": 2000, "global_step": 10000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 4e-05, "grad_norm": 456.0, "learning_rate": 1.18e-05, "loss": 85.4554, "loss/crossentropy": 9.650346755981445, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 8.066818237304688, "step": 2 }, { "epoch": 8e-05, "grad_norm": 416.0, "learning_rate": 1.3600000000000002e-05, "loss": 84.3418, "loss/crossentropy": 9.544375896453857, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 7.628942489624023, "step": 4 }, { "epoch": 0.00012, "grad_norm": 466.0, "learning_rate": 1.54e-05, "loss": 87.2187, "loss/crossentropy": 9.569977283477783, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 7.7909746170043945, "step": 6 }, { "epoch": 0.00016, "grad_norm": 247.0, "learning_rate": 1.72e-05, "loss": 82.5078, "loss/crossentropy": 9.06786823272705, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 7.3673131465911865, "step": 8 }, { "epoch": 0.0002, "grad_norm": 179.0, "learning_rate": 1.9e-05, "loss": 78.2757, "loss/crossentropy": 8.918366432189941, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 6.912693023681641, "step": 10 }, { "epoch": 0.00024, "grad_norm": 148.0, "learning_rate": 2.0800000000000004e-05, "loss": 74.4248, "loss/crossentropy": 8.443636417388916, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 6.567321538925171, "step": 12 }, { "epoch": 0.00028, "grad_norm": 131.0, "learning_rate": 2.2600000000000004e-05, "loss": 73.0003, "loss/crossentropy": 8.428278923034668, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 6.706400156021118, "step": 14 }, { "epoch": 0.00032, "grad_norm": 181.0, "grad_norm_var": 16279.8625, "learning_rate": 2.4400000000000004e-05, "loss": 70.0047, "loss/crossentropy": 8.216889381408691, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 6.056080102920532, "step": 16 }, { "epoch": 0.00036, "grad_norm": 90.5, "grad_norm_var": 14154.148958333333, "learning_rate": 2.6200000000000003e-05, "loss": 69.9766, "loss/crossentropy": 8.191599607467651, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 6.429446697235107, "step": 18 }, { "epoch": 0.0004, "grad_norm": 52.25, "grad_norm_var": 12194.27890625, "learning_rate": 2.8000000000000003e-05, "loss": 64.3807, "loss/crossentropy": 7.506032228469849, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 5.794633388519287, "step": 20 }, { "epoch": 0.00044, "grad_norm": 39.25, "grad_norm_var": 6249.4875, "learning_rate": 2.9800000000000006e-05, "loss": 61.2802, "loss/crossentropy": 7.152851343154907, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 5.261489152908325, "step": 22 }, { "epoch": 0.00048, "grad_norm": 57.0, "grad_norm_var": 4626.8875, "learning_rate": 3.16e-05, "loss": 58.3454, "loss/crossentropy": 6.956738471984863, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 5.020895004272461, "step": 24 }, { "epoch": 0.00052, "grad_norm": 86.0, "grad_norm_var": 4244.565625, "learning_rate": 3.3400000000000005e-05, "loss": 54.2703, "loss/crossentropy": 6.686542987823486, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 4.801911354064941, "step": 26 }, { "epoch": 0.00056, "grad_norm": 110.5, "grad_norm_var": 3868.5875, "learning_rate": 3.520000000000001e-05, "loss": 51.7343, "loss/crossentropy": 6.4867262840271, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 4.5746169090271, "step": 28 }, { "epoch": 0.0006, "grad_norm": 50.0, "grad_norm_var": 3953.82890625, "learning_rate": 3.7e-05, "loss": 49.6807, "loss/crossentropy": 6.364065408706665, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 4.230688810348511, "step": 30 }, { "epoch": 0.00064, "grad_norm": 68.0, "grad_norm_var": 3157.4958333333334, "learning_rate": 3.88e-05, "loss": 44.7112, "loss/crossentropy": 5.731794834136963, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 3.6969637870788574, "step": 32 }, { "epoch": 0.00068, "grad_norm": 50.75, "grad_norm_var": 500.59973958333336, "learning_rate": 4.0600000000000004e-05, "loss": 42.2923, "loss/crossentropy": 5.553718328475952, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 3.7271878719329834, "step": 34 }, { "epoch": 0.00072, "grad_norm": 55.5, "grad_norm_var": 342.2122395833333, "learning_rate": 4.240000000000001e-05, "loss": 37.7465, "loss/crossentropy": 5.023651361465454, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 3.2670111656188965, "step": 36 }, { "epoch": 0.00076, "grad_norm": 75.5, "grad_norm_var": 283.37395833333335, "learning_rate": 4.420000000000001e-05, "loss": 35.1313, "loss/crossentropy": 4.921839237213135, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 2.950661063194275, "step": 38 }, { "epoch": 0.0008, "grad_norm": 44.5, "grad_norm_var": 299.1958333333333, "learning_rate": 4.600000000000001e-05, "loss": 32.3316, "loss/crossentropy": 4.782621145248413, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 2.7473502159118652, "step": 40 }, { "epoch": 0.00084, "grad_norm": 36.5, "grad_norm_var": 361.1372395833333, "learning_rate": 4.78e-05, "loss": 28.4104, "loss/crossentropy": 3.8754972219467163, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 2.410745143890381, "step": 42 }, { "epoch": 0.00088, "grad_norm": 36.25, "grad_norm_var": 225.80598958333334, "learning_rate": 4.96e-05, "loss": 26.1806, "loss/crossentropy": 3.9885865449905396, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 2.1414425373077393, "step": 44 }, { "epoch": 0.00092, "grad_norm": 46.0, "grad_norm_var": 249.475, "learning_rate": 5.14e-05, "loss": 24.4012, "loss/crossentropy": 3.750515580177307, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 2.063339352607727, "step": 46 }, { "epoch": 0.00096, "grad_norm": 20.875, "grad_norm_var": 315.8291015625, "learning_rate": 5.3200000000000006e-05, "loss": 22.822, "loss/crossentropy": 3.7912577390670776, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 1.9196046590805054, "step": 48 }, { "epoch": 0.001, "grad_norm": 35.5, "grad_norm_var": 279.284375, "learning_rate": 5.500000000000001e-05, "loss": 21.036, "loss/crossentropy": 3.777758002281189, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 1.7479944229125977, "step": 50 }, { "epoch": 0.00104, "grad_norm": 20.75, "grad_norm_var": 306.15149739583336, "learning_rate": 5.680000000000001e-05, "loss": 20.3608, "loss/crossentropy": 3.5903185606002808, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 1.5533717274665833, "step": 52 }, { "epoch": 0.00108, "grad_norm": 43.75, "grad_norm_var": 213.07395833333334, "learning_rate": 5.860000000000001e-05, "loss": 18.813, "loss/crossentropy": 3.691780686378479, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 1.4755533933639526, "step": 54 }, { "epoch": 0.00112, "grad_norm": 21.25, "grad_norm_var": 70.690625, "learning_rate": 6.040000000000001e-05, "loss": 19.1421, "loss/crossentropy": 3.557003617286682, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 1.5198156833648682, "step": 56 }, { "epoch": 0.00116, "grad_norm": 21.5, "grad_norm_var": 76.30390625, "learning_rate": 6.220000000000001e-05, "loss": 17.2705, "loss/crossentropy": 3.2730292081832886, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 1.4131997227668762, "step": 58 }, { "epoch": 0.0012, "grad_norm": 19.875, "grad_norm_var": 77.11399739583334, "learning_rate": 6.400000000000001e-05, "loss": 16.4712, "loss/crossentropy": 3.419156074523926, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 1.3277101516723633, "step": 60 }, { "epoch": 0.00124, "grad_norm": 25.75, "grad_norm_var": 48.60520833333333, "learning_rate": 6.58e-05, "loss": 16.6219, "loss/crossentropy": 2.973878502845764, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 1.3438007831573486, "step": 62 }, { "epoch": 0.00128, "grad_norm": 34.5, "grad_norm_var": 53.18020833333333, "learning_rate": 6.76e-05, "loss": 15.0929, "loss/crossentropy": 2.892021059989929, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 1.1624282002449036, "step": 64 }, { "epoch": 0.00132, "grad_norm": 15.4375, "grad_norm_var": 51.195947265625, "learning_rate": 6.94e-05, "loss": 15.1967, "loss/crossentropy": 2.954660177230835, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 1.116301715373993, "step": 66 }, { "epoch": 0.00136, "grad_norm": 32.0, "grad_norm_var": 51.064306640625, "learning_rate": 7.120000000000001e-05, "loss": 14.9397, "loss/crossentropy": 3.2686156034469604, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 1.1978037357330322, "step": 68 }, { "epoch": 0.0014, "grad_norm": 29.25, "grad_norm_var": 32.794270833333336, "learning_rate": 7.3e-05, "loss": 14.4846, "loss/crossentropy": 2.7956581115722656, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 1.1973016262054443, "step": 70 }, { "epoch": 0.00144, "grad_norm": 15.25, "grad_norm_var": 37.80149739583333, "learning_rate": 7.48e-05, "loss": 14.1296, "loss/crossentropy": 3.08966863155365, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 1.151496708393097, "step": 72 }, { "epoch": 0.00148, "grad_norm": 18.625, "grad_norm_var": 41.88984375, "learning_rate": 7.66e-05, "loss": 13.6812, "loss/crossentropy": 2.949987292289734, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.9787414371967316, "step": 74 }, { "epoch": 0.00152, "grad_norm": 15.1875, "grad_norm_var": 49.143489583333334, "learning_rate": 7.840000000000001e-05, "loss": 12.8901, "loss/crossentropy": 3.1161292791366577, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 1.0640113949775696, "step": 76 }, { "epoch": 0.00156, "grad_norm": 22.125, "grad_norm_var": 47.63170572916667, "learning_rate": 8.020000000000001e-05, "loss": 13.157, "loss/crossentropy": 3.3661664724349976, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 1.1353825330734253, "step": 78 }, { "epoch": 0.0016, "grad_norm": 16.25, "grad_norm_var": 35.33274739583333, "learning_rate": 8.200000000000001e-05, "loss": 12.9372, "loss/crossentropy": 2.927241563796997, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.9984134435653687, "step": 80 }, { "epoch": 0.00164, "grad_norm": 13.625, "grad_norm_var": 37.53951822916667, "learning_rate": 8.38e-05, "loss": 12.0477, "loss/crossentropy": 3.1273285150527954, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.9241160154342651, "step": 82 }, { "epoch": 0.00168, "grad_norm": 19.125, "grad_norm_var": 19.602718098958334, "learning_rate": 8.560000000000001e-05, "loss": 11.9084, "loss/crossentropy": 2.737278938293457, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.8822851181030273, "step": 84 }, { "epoch": 0.00172, "grad_norm": 13.0625, "grad_norm_var": 11.107014973958334, "learning_rate": 8.740000000000001e-05, "loss": 11.8594, "loss/crossentropy": 2.4452388286590576, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.8693483769893646, "step": 86 }, { "epoch": 0.00176, "grad_norm": 18.375, "grad_norm_var": 11.328369140625, "learning_rate": 8.92e-05, "loss": 11.5058, "loss/crossentropy": 2.89771831035614, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.8749278783798218, "step": 88 }, { "epoch": 0.0018, "grad_norm": 12.1875, "grad_norm_var": 13.811572265625, "learning_rate": 9.1e-05, "loss": 11.9281, "loss/crossentropy": 3.0173208713531494, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.8714744746685028, "step": 90 }, { "epoch": 0.00184, "grad_norm": 20.375, "grad_norm_var": 16.114306640625, "learning_rate": 9.28e-05, "loss": 11.4244, "loss/crossentropy": 2.588515043258667, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.9651442170143127, "step": 92 }, { "epoch": 0.00188, "grad_norm": 13.375, "grad_norm_var": 12.784309895833333, "learning_rate": 9.46e-05, "loss": 11.6092, "loss/crossentropy": 2.8116774559020996, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.9143906235694885, "step": 94 }, { "epoch": 0.00192, "grad_norm": 16.375, "grad_norm_var": 13.225764973958333, "learning_rate": 9.64e-05, "loss": 10.6723, "loss/crossentropy": 2.8734441995620728, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.814399927854538, "step": 96 }, { "epoch": 0.00196, "grad_norm": 10.3125, "grad_norm_var": 17.228238932291667, "learning_rate": 9.82e-05, "loss": 10.6577, "loss/crossentropy": 2.664194703102112, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.7905783653259277, "step": 98 }, { "epoch": 0.002, "grad_norm": 12.3125, "grad_norm_var": 17.113785807291666, "learning_rate": 0.0001, "loss": 10.6847, "loss/crossentropy": 2.4851003885269165, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.769305944442749, "step": 100 }, { "epoch": 0.00204, "grad_norm": 10.75, "grad_norm_var": 13.9140625, "learning_rate": 0.0001, "loss": 10.8119, "loss/crossentropy": 2.2757182121276855, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.8241511881351471, "step": 102 }, { "epoch": 0.00208, "grad_norm": 11.375, "grad_norm_var": 13.615738932291666, "learning_rate": 0.0001, "loss": 10.6244, "loss/crossentropy": 2.7211785316467285, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.8518709540367126, "step": 104 }, { "epoch": 0.00212, "grad_norm": 11.625, "grad_norm_var": 14.163395182291667, "learning_rate": 0.0001, "loss": 10.5629, "loss/crossentropy": 2.387019991874695, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.7568954229354858, "step": 106 }, { "epoch": 0.00216, "grad_norm": 9.9375, "grad_norm_var": 10.788525390625, "learning_rate": 0.0001, "loss": 10.1364, "loss/crossentropy": 2.5363346338272095, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.7648341059684753, "step": 108 }, { "epoch": 0.0022, "grad_norm": 20.0, "grad_norm_var": 14.898893229166667, "learning_rate": 0.0001, "loss": 10.8773, "loss/crossentropy": 2.8450236320495605, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.737193763256073, "step": 110 }, { "epoch": 0.00224, "grad_norm": 10.25, "grad_norm_var": 15.364518229166666, "learning_rate": 0.0001, "loss": 9.4554, "loss/crossentropy": 2.4827451705932617, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.6924614012241364, "step": 112 }, { "epoch": 0.00228, "grad_norm": 10.5625, "grad_norm_var": 7.461832682291667, "learning_rate": 0.0001, "loss": 9.9651, "loss/crossentropy": 2.093318462371826, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.7038464546203613, "step": 114 }, { "epoch": 0.00232, "grad_norm": 9.6875, "grad_norm_var": 7.597330729166667, "learning_rate": 0.0001, "loss": 10.0297, "loss/crossentropy": 2.5149790048599243, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.6980189085006714, "step": 116 }, { "epoch": 0.00236, "grad_norm": 11.625, "grad_norm_var": 6.672509765625, "learning_rate": 0.0001, "loss": 9.8176, "loss/crossentropy": 2.6022276878356934, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.7005251348018646, "step": 118 }, { "epoch": 0.0024, "grad_norm": 7.625, "grad_norm_var": 7.298160807291667, "learning_rate": 0.0001, "loss": 9.5658, "loss/crossentropy": 2.6836462020874023, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.7237774729728699, "step": 120 }, { "epoch": 0.00244, "grad_norm": 9.5625, "grad_norm_var": 7.402018229166667, "learning_rate": 0.0001, "loss": 9.7376, "loss/crossentropy": 2.6823805570602417, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.7570162117481232, "step": 122 }, { "epoch": 0.00248, "grad_norm": 11.25, "grad_norm_var": 7.391259765625, "learning_rate": 0.0001, "loss": 9.4713, "loss/crossentropy": 2.6233514547348022, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.7183247208595276, "step": 124 }, { "epoch": 0.00252, "grad_norm": 9.9375, "grad_norm_var": 1.0839680989583333, "learning_rate": 0.0001, "loss": 9.2243, "loss/crossentropy": 2.331676959991455, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.6982125043869019, "step": 126 }, { "epoch": 0.00256, "grad_norm": 9.1875, "grad_norm_var": 0.9687337239583333, "learning_rate": 0.0001, "loss": 9.4777, "loss/crossentropy": 2.429046392440796, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.696417510509491, "step": 128 }, { "epoch": 0.0026, "grad_norm": 14.8125, "grad_norm_var": 2.448291015625, "learning_rate": 0.0001, "loss": 9.9024, "loss/crossentropy": 2.5262571573257446, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.716008871793747, "step": 130 }, { "epoch": 0.00264, "grad_norm": 8.8125, "grad_norm_var": 2.7044270833333335, "learning_rate": 0.0001, "loss": 9.2836, "loss/crossentropy": 2.187526524066925, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.666053056716919, "step": 132 }, { "epoch": 0.00268, "grad_norm": 8.9375, "grad_norm_var": 3.285270182291667, "learning_rate": 0.0001, "loss": 9.8338, "loss/crossentropy": 2.4199057817459106, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.6709816455841064, "step": 134 }, { "epoch": 0.00272, "grad_norm": 9.6875, "grad_norm_var": 3.4072916666666666, "learning_rate": 0.0001, "loss": 9.4225, "loss/crossentropy": 2.1963008642196655, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.5963725447654724, "step": 136 }, { "epoch": 0.00276, "grad_norm": 8.0625, "grad_norm_var": 3.556884765625, "learning_rate": 0.0001, "loss": 9.44, "loss/crossentropy": 2.5878132581710815, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.6362220048904419, "step": 138 }, { "epoch": 0.0028, "grad_norm": 9.25, "grad_norm_var": 3.4852701822916665, "learning_rate": 0.0001, "loss": 9.4314, "loss/crossentropy": 2.7800480127334595, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.6819457113742828, "step": 140 }, { "epoch": 0.00284, "grad_norm": 8.4375, "grad_norm_var": 3.838997395833333, "learning_rate": 0.0001, "loss": 9.1047, "loss/crossentropy": 2.5055110454559326, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.674308180809021, "step": 142 }, { "epoch": 0.00288, "grad_norm": 8.25, "grad_norm_var": 4.078499348958333, "learning_rate": 0.0001, "loss": 9.1578, "loss/crossentropy": 2.8532944917678833, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.7083481848239899, "step": 144 }, { "epoch": 0.00292, "grad_norm": 7.8125, "grad_norm_var": 2.2759765625, "learning_rate": 0.0001, "loss": 8.8023, "loss/crossentropy": 2.442527174949646, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.6510869562625885, "step": 146 }, { "epoch": 0.00296, "grad_norm": 10.9375, "grad_norm_var": 2.8544881184895834, "learning_rate": 0.0001, "loss": 8.7238, "loss/crossentropy": 2.516597867012024, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.6362285614013672, "step": 148 }, { "epoch": 0.003, "grad_norm": 7.40625, "grad_norm_var": 1.8615885416666667, "learning_rate": 0.0001, "loss": 8.5543, "loss/crossentropy": 2.8672900199890137, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.6459421515464783, "step": 150 }, { "epoch": 0.00304, "grad_norm": 8.1875, "grad_norm_var": 1.7195963541666666, "learning_rate": 0.0001, "loss": 8.6403, "loss/crossentropy": 2.2042795419692993, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.5720505118370056, "step": 152 }, { "epoch": 0.00308, "grad_norm": 7.8125, "grad_norm_var": 1.876025390625, "learning_rate": 0.0001, "loss": 8.768, "loss/crossentropy": 2.225563883781433, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.6328675150871277, "step": 154 }, { "epoch": 0.00312, "grad_norm": 8.0625, "grad_norm_var": 1.5925618489583333, "learning_rate": 0.0001, "loss": 8.5743, "loss/crossentropy": 2.3541462421417236, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.5927431881427765, "step": 156 }, { "epoch": 0.00316, "grad_norm": 7.34375, "grad_norm_var": 1.834619140625, "learning_rate": 0.0001, "loss": 8.7329, "loss/crossentropy": 2.4685616493225098, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.6299368739128113, "step": 158 }, { "epoch": 0.0032, "grad_norm": 9.25, "grad_norm_var": 1.4429646809895833, "learning_rate": 0.0001, "loss": 8.4796, "loss/crossentropy": 2.4637919664382935, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.6084515154361725, "step": 160 }, { "epoch": 0.00324, "grad_norm": 9.0625, "grad_norm_var": 1.3692545572916666, "learning_rate": 0.0001, "loss": 9.0446, "loss/crossentropy": 2.598397374153137, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.574739396572113, "step": 162 }, { "epoch": 0.00328, "grad_norm": 7.15625, "grad_norm_var": 0.9171834309895833, "learning_rate": 0.0001, "loss": 8.1508, "loss/crossentropy": 2.5183030366897583, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.5985860526561737, "step": 164 }, { "epoch": 0.00332, "grad_norm": 9.125, "grad_norm_var": 0.9571614583333333, "learning_rate": 0.0001, "loss": 8.4296, "loss/crossentropy": 2.252183437347412, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.5627379417419434, "step": 166 }, { "epoch": 0.00336, "grad_norm": 7.875, "grad_norm_var": 0.70533447265625, "learning_rate": 0.0001, "loss": 8.4549, "loss/crossentropy": 2.5720516443252563, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.5945309698581696, "step": 168 }, { "epoch": 0.0034, "grad_norm": 9.1875, "grad_norm_var": 0.8844034830729167, "learning_rate": 0.0001, "loss": 8.6096, "loss/crossentropy": 2.3004332184791565, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.5401738286018372, "step": 170 }, { "epoch": 0.00344, "grad_norm": 7.71875, "grad_norm_var": 0.948681640625, "learning_rate": 0.0001, "loss": 8.5484, "loss/crossentropy": 2.689734935760498, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.6384358406066895, "step": 172 }, { "epoch": 0.00348, "grad_norm": 6.84375, "grad_norm_var": 0.9418253580729167, "learning_rate": 0.0001, "loss": 8.1888, "loss/crossentropy": 1.944397747516632, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.5828913450241089, "step": 174 }, { "epoch": 0.00352, "grad_norm": 6.53125, "grad_norm_var": 0.8817342122395834, "learning_rate": 0.0001, "loss": 8.0577, "loss/crossentropy": 2.8166507482528687, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.5916908979415894, "step": 176 }, { "epoch": 0.00356, "grad_norm": 7.09375, "grad_norm_var": 0.7456990559895833, "learning_rate": 0.0001, "loss": 8.7701, "loss/crossentropy": 2.3208402395248413, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.5968939661979675, "step": 178 }, { "epoch": 0.0036, "grad_norm": 7.25, "grad_norm_var": 0.7425130208333334, "learning_rate": 0.0001, "loss": 8.2615, "loss/crossentropy": 2.817763566970825, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.5800873041152954, "step": 180 }, { "epoch": 0.00364, "grad_norm": 6.78125, "grad_norm_var": 0.6126912434895834, "learning_rate": 0.0001, "loss": 8.3053, "loss/crossentropy": 2.250023365020752, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.5279016494750977, "step": 182 }, { "epoch": 0.00368, "grad_norm": 6.09375, "grad_norm_var": 0.6917805989583333, "learning_rate": 0.0001, "loss": 7.7974, "loss/crossentropy": 2.1337096095085144, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.5393811166286469, "step": 184 }, { "epoch": 0.00372, "grad_norm": 6.34375, "grad_norm_var": 0.5962198893229167, "learning_rate": 0.0001, "loss": 7.7258, "loss/crossentropy": 2.6338934898376465, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.6444519460201263, "step": 186 }, { "epoch": 0.00376, "grad_norm": 10.4375, "grad_norm_var": 1.2786458333333333, "learning_rate": 0.0001, "loss": 8.0762, "loss/crossentropy": 2.66677463054657, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.6012931764125824, "step": 188 }, { "epoch": 0.0038, "grad_norm": 6.375, "grad_norm_var": 1.6642537434895834, "learning_rate": 0.0001, "loss": 8.3177, "loss/crossentropy": 2.3731196522712708, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.6299596428871155, "step": 190 }, { "epoch": 0.00384, "grad_norm": 7.4375, "grad_norm_var": 1.7719889322916667, "learning_rate": 0.0001, "loss": 8.214, "loss/crossentropy": 2.411492705345154, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.5328834354877472, "step": 192 }, { "epoch": 0.00388, "grad_norm": 7.15625, "grad_norm_var": 1.9489420572916667, "learning_rate": 0.0001, "loss": 7.9763, "loss/crossentropy": 2.2402734756469727, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.539243072271347, "step": 194 }, { "epoch": 0.00392, "grad_norm": 7.53125, "grad_norm_var": 1.895556640625, "learning_rate": 0.0001, "loss": 7.9292, "loss/crossentropy": 2.3250681161880493, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.5374342203140259, "step": 196 }, { "epoch": 0.00396, "grad_norm": 7.75, "grad_norm_var": 1.8979451497395834, "learning_rate": 0.0001, "loss": 7.9201, "loss/crossentropy": 2.42138135433197, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.5410921573638916, "step": 198 }, { "epoch": 0.004, "grad_norm": 6.78125, "grad_norm_var": 1.8954386393229166, "learning_rate": 0.0001, "loss": 7.7597, "loss/crossentropy": 2.1954251527786255, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.49367184937000275, "step": 200 }, { "epoch": 0.00404, "grad_norm": 6.75, "grad_norm_var": 1.6848307291666667, "learning_rate": 0.0001, "loss": 7.9033, "loss/crossentropy": 2.81479811668396, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.5567455887794495, "step": 202 }, { "epoch": 0.00408, "grad_norm": 6.5, "grad_norm_var": 1.08228759765625, "learning_rate": 0.0001, "loss": 7.9812, "loss/crossentropy": 2.611761450767517, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.544409453868866, "step": 204 }, { "epoch": 0.00412, "grad_norm": 6.125, "grad_norm_var": 0.650244140625, "learning_rate": 0.0001, "loss": 7.7921, "loss/crossentropy": 2.1369245052337646, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.5181446373462677, "step": 206 }, { "epoch": 0.00416, "grad_norm": 7.1875, "grad_norm_var": 0.30745035807291665, "learning_rate": 0.0001, "loss": 8.3375, "loss/crossentropy": 2.435856580734253, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.5506252646446228, "step": 208 }, { "epoch": 0.0042, "grad_norm": 6.21875, "grad_norm_var": 0.26848958333333334, "learning_rate": 0.0001, "loss": 7.7599, "loss/crossentropy": 2.2404768466949463, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.46975645422935486, "step": 210 }, { "epoch": 0.00424, "grad_norm": 6.71875, "grad_norm_var": 0.19713541666666667, "learning_rate": 0.0001, "loss": 7.7083, "loss/crossentropy": 2.4866777658462524, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.5217231214046478, "step": 212 }, { "epoch": 0.00428, "grad_norm": 6.15625, "grad_norm_var": 0.121484375, "learning_rate": 0.0001, "loss": 7.6519, "loss/crossentropy": 2.074867010116577, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.4683506190776825, "step": 214 }, { "epoch": 0.00432, "grad_norm": 7.59375, "grad_norm_var": 0.21679280598958334, "learning_rate": 0.0001, "loss": 7.6062, "loss/crossentropy": 2.2040151357650757, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.5235520601272583, "step": 216 }, { "epoch": 0.00436, "grad_norm": 7.25, "grad_norm_var": 0.25690104166666666, "learning_rate": 0.0001, "loss": 7.886, "loss/crossentropy": 2.174479365348816, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.5243187248706818, "step": 218 }, { "epoch": 0.0044, "grad_norm": 6.28125, "grad_norm_var": 0.26523030598958336, "learning_rate": 0.0001, "loss": 7.7535, "loss/crossentropy": 2.5678584575653076, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.5818615555763245, "step": 220 }, { "epoch": 0.00444, "grad_norm": 6.0625, "grad_norm_var": 0.2814412434895833, "learning_rate": 0.0001, "loss": 7.859, "loss/crossentropy": 2.4551891088485718, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.5400412082672119, "step": 222 }, { "epoch": 0.00448, "grad_norm": 6.15625, "grad_norm_var": 0.25310872395833334, "learning_rate": 0.0001, "loss": 7.7341, "loss/crossentropy": 2.0638335943222046, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.49279752373695374, "step": 224 }, { "epoch": 0.00452, "grad_norm": 6.6875, "grad_norm_var": 0.38235677083333336, "learning_rate": 0.0001, "loss": 8.1064, "loss/crossentropy": 2.553247332572937, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.5566797852516174, "step": 226 }, { "epoch": 0.00456, "grad_norm": 6.0625, "grad_norm_var": 0.39451497395833335, "learning_rate": 0.0001, "loss": 7.7812, "loss/crossentropy": 2.544332265853882, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.5118530094623566, "step": 228 }, { "epoch": 0.0046, "grad_norm": 5.875, "grad_norm_var": 0.4054036458333333, "learning_rate": 0.0001, "loss": 7.064, "loss/crossentropy": 2.191234052181244, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.5157675743103027, "step": 230 }, { "epoch": 0.00464, "grad_norm": 5.4375, "grad_norm_var": 0.360791015625, "learning_rate": 0.0001, "loss": 7.6611, "loss/crossentropy": 2.3671151399612427, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.5233491659164429, "step": 232 }, { "epoch": 0.00468, "grad_norm": 7.125, "grad_norm_var": 0.4180826822916667, "learning_rate": 0.0001, "loss": 7.4509, "loss/crossentropy": 2.3003920316696167, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.5037694871425629, "step": 234 }, { "epoch": 0.00472, "grad_norm": 5.625, "grad_norm_var": 0.4305338541666667, "learning_rate": 0.0001, "loss": 7.8236, "loss/crossentropy": 2.4672670364379883, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.5187652707099915, "step": 236 }, { "epoch": 0.00476, "grad_norm": 6.0625, "grad_norm_var": 0.4493326822916667, "learning_rate": 0.0001, "loss": 7.3246, "loss/crossentropy": 2.179289937019348, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.5111505687236786, "step": 238 }, { "epoch": 0.0048, "grad_norm": 6.34375, "grad_norm_var": 0.5123697916666666, "learning_rate": 0.0001, "loss": 7.6064, "loss/crossentropy": 2.2424627542495728, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.5187103897333145, "step": 240 }, { "epoch": 0.00484, "grad_norm": 5.96875, "grad_norm_var": 0.2986979166666667, "learning_rate": 0.0001, "loss": 7.8108, "loss/crossentropy": 2.8024520874023438, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.5700482130050659, "step": 242 }, { "epoch": 0.00488, "grad_norm": 6.25, "grad_norm_var": 0.4554524739583333, "learning_rate": 0.0001, "loss": 7.6644, "loss/crossentropy": 2.3653491735458374, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.5232449471950531, "step": 244 }, { "epoch": 0.00492, "grad_norm": 6.90625, "grad_norm_var": 0.48513997395833336, "learning_rate": 0.0001, "loss": 7.5028, "loss/crossentropy": 2.6201778650283813, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.5684538185596466, "step": 246 }, { "epoch": 0.00496, "grad_norm": 7.3125, "grad_norm_var": 0.49654541015625, "learning_rate": 0.0001, "loss": 7.6631, "loss/crossentropy": 2.2811471819877625, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.5543638169765472, "step": 248 }, { "epoch": 0.005, "grad_norm": 5.625, "grad_norm_var": 0.48544514973958336, "learning_rate": 0.0001, "loss": 7.6914, "loss/crossentropy": 2.4381459951400757, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.5594777166843414, "step": 250 }, { "epoch": 0.00504, "grad_norm": 9.375, "grad_norm_var": 1.04000244140625, "learning_rate": 0.0001, "loss": 7.7346, "loss/crossentropy": 2.435782313346863, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.5159732699394226, "step": 252 }, { "epoch": 0.00508, "grad_norm": 5.46875, "grad_norm_var": 1.0892862955729166, "learning_rate": 0.0001, "loss": 8.0282, "loss/crossentropy": 2.7867215871810913, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.551640123128891, "step": 254 }, { "epoch": 0.00512, "grad_norm": 5.46875, "grad_norm_var": 1.2432902018229166, "learning_rate": 0.0001, "loss": 7.0744, "loss/crossentropy": 1.9328945875167847, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.45153285562992096, "step": 256 }, { "epoch": 0.00516, "grad_norm": 6.21875, "grad_norm_var": 1.2460286458333334, "learning_rate": 0.0001, "loss": 7.292, "loss/crossentropy": 2.552613139152527, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.5215992629528046, "step": 258 }, { "epoch": 0.0052, "grad_norm": 5.40625, "grad_norm_var": 1.1848958333333333, "learning_rate": 0.0001, "loss": 7.4992, "loss/crossentropy": 2.3720492124557495, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.5024219453334808, "step": 260 }, { "epoch": 0.00524, "grad_norm": 8.1875, "grad_norm_var": 1.4898274739583333, "learning_rate": 0.0001, "loss": 7.4676, "loss/crossentropy": 2.465815782546997, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.5383751839399338, "step": 262 }, { "epoch": 0.00528, "grad_norm": 6.0625, "grad_norm_var": 1.5113240559895833, "learning_rate": 0.0001, "loss": 7.3163, "loss/crossentropy": 2.2791935205459595, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.534159854054451, "step": 264 }, { "epoch": 0.00532, "grad_norm": 6.28125, "grad_norm_var": 1.3855305989583333, "learning_rate": 0.0001, "loss": 7.9279, "loss/crossentropy": 2.48906409740448, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.5344790518283844, "step": 266 }, { "epoch": 0.00536, "grad_norm": 4.90625, "grad_norm_var": 1.0180826822916667, "learning_rate": 0.0001, "loss": 7.3178, "loss/crossentropy": 2.0858306884765625, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.47646892070770264, "step": 268 }, { "epoch": 0.0054, "grad_norm": 8.375, "grad_norm_var": 1.18258056640625, "learning_rate": 0.0001, "loss": 7.383, "loss/crossentropy": 2.159322738647461, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.5543113648891449, "step": 270 }, { "epoch": 0.00544, "grad_norm": 4.9375, "grad_norm_var": 1.204931640625, "learning_rate": 0.0001, "loss": 7.1635, "loss/crossentropy": 2.249913454055786, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.4896702766418457, "step": 272 }, { "epoch": 0.00548, "grad_norm": 8.125, "grad_norm_var": 1.388916015625, "learning_rate": 0.0001, "loss": 7.3565, "loss/crossentropy": 1.998712420463562, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.4770403504371643, "step": 274 }, { "epoch": 0.00552, "grad_norm": 5.53125, "grad_norm_var": 1.9266764322916667, "learning_rate": 0.0001, "loss": 7.6522, "loss/crossentropy": 2.391260862350464, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.5259605348110199, "step": 276 }, { "epoch": 0.00556, "grad_norm": 6.1875, "grad_norm_var": 1.6884765625, "learning_rate": 0.0001, "loss": 7.2935, "loss/crossentropy": 2.523361325263977, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.4724537879228592, "step": 278 }, { "epoch": 0.0056, "grad_norm": 5.34375, "grad_norm_var": 1.7386555989583334, "learning_rate": 0.0001, "loss": 7.3505, "loss/crossentropy": 2.281963586807251, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.49724647402763367, "step": 280 }, { "epoch": 0.00564, "grad_norm": 5.0625, "grad_norm_var": 1.8446451822916667, "learning_rate": 0.0001, "loss": 7.1079, "loss/crossentropy": 2.2403814792633057, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.45384156703948975, "step": 282 }, { "epoch": 0.00568, "grad_norm": 5.6875, "grad_norm_var": 1.670166015625, "learning_rate": 0.0001, "loss": 7.4318, "loss/crossentropy": 2.2687963247299194, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.46648281812667847, "step": 284 }, { "epoch": 0.00572, "grad_norm": 5.75, "grad_norm_var": 1.38541259765625, "learning_rate": 0.0001, "loss": 7.3035, "loss/crossentropy": 2.3336217403411865, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.4792183041572571, "step": 286 }, { "epoch": 0.00576, "grad_norm": 6.4375, "grad_norm_var": 1.292431640625, "learning_rate": 0.0001, "loss": 7.0031, "loss/crossentropy": 2.4006571769714355, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.473391056060791, "step": 288 }, { "epoch": 0.0058, "grad_norm": 5.09375, "grad_norm_var": 1.18671875, "learning_rate": 0.0001, "loss": 6.8037, "loss/crossentropy": 2.0306124687194824, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.4722501188516617, "step": 290 }, { "epoch": 0.00584, "grad_norm": 5.34375, "grad_norm_var": 0.38865559895833335, "learning_rate": 0.0001, "loss": 7.1378, "loss/crossentropy": 2.412277102470398, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.48340730369091034, "step": 292 }, { "epoch": 0.00588, "grad_norm": 6.21875, "grad_norm_var": 0.3952433268229167, "learning_rate": 0.0001, "loss": 7.3589, "loss/crossentropy": 2.3195769786834717, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.5074818134307861, "step": 294 }, { "epoch": 0.00592, "grad_norm": 6.71875, "grad_norm_var": 0.2867024739583333, "learning_rate": 0.0001, "loss": 7.2826, "loss/crossentropy": 2.3265275955200195, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.5351093411445618, "step": 296 }, { "epoch": 0.00596, "grad_norm": 5.625, "grad_norm_var": 0.26171468098958334, "learning_rate": 0.0001, "loss": 7.0106, "loss/crossentropy": 2.210574746131897, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.45469868183135986, "step": 298 }, { "epoch": 0.006, "grad_norm": 6.3125, "grad_norm_var": 0.3034138997395833, "learning_rate": 0.0001, "loss": 7.4741, "loss/crossentropy": 2.347964644432068, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.45269395411014557, "step": 300 }, { "epoch": 0.00604, "grad_norm": 5.0, "grad_norm_var": 0.34308268229166666, "learning_rate": 0.0001, "loss": 6.8901, "loss/crossentropy": 2.197494626045227, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.45010584592819214, "step": 302 }, { "epoch": 0.00608, "grad_norm": 5.8125, "grad_norm_var": 0.32224934895833335, "learning_rate": 0.0001, "loss": 6.909, "loss/crossentropy": 2.3295196890830994, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.48989084362983704, "step": 304 }, { "epoch": 0.00612, "grad_norm": 6.3125, "grad_norm_var": 0.26347249348958335, "learning_rate": 0.0001, "loss": 7.6606, "loss/crossentropy": 2.5208678245544434, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.46991507709026337, "step": 306 }, { "epoch": 0.00616, "grad_norm": 5.40625, "grad_norm_var": 0.22849934895833332, "learning_rate": 0.0001, "loss": 7.2881, "loss/crossentropy": 2.6091307401657104, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.5090319812297821, "step": 308 }, { "epoch": 0.0062, "grad_norm": 5.4375, "grad_norm_var": 0.3083984375, "learning_rate": 0.0001, "loss": 7.0888, "loss/crossentropy": 2.4142966270446777, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.45959460735321045, "step": 310 }, { "epoch": 0.00624, "grad_norm": 6.0625, "grad_norm_var": 0.25006103515625, "learning_rate": 0.0001, "loss": 7.3054, "loss/crossentropy": 2.3062673807144165, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.4381408095359802, "step": 312 }, { "epoch": 0.00628, "grad_norm": 4.875, "grad_norm_var": 0.29498697916666666, "learning_rate": 0.0001, "loss": 6.5202, "loss/crossentropy": 2.1124885082244873, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.4392775595188141, "step": 314 }, { "epoch": 0.00632, "grad_norm": 5.09375, "grad_norm_var": 0.3001302083333333, "learning_rate": 0.0001, "loss": 6.3297, "loss/crossentropy": 2.0250568985939026, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.4133095294237137, "step": 316 }, { "epoch": 0.00636, "grad_norm": 5.625, "grad_norm_var": 0.31021728515625, "learning_rate": 0.0001, "loss": 6.9903, "loss/crossentropy": 2.4011316299438477, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.46330246329307556, "step": 318 }, { "epoch": 0.0064, "grad_norm": 5.65625, "grad_norm_var": 0.30305582682291665, "learning_rate": 0.0001, "loss": 7.2114, "loss/crossentropy": 2.487559676170349, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.47237157821655273, "step": 320 }, { "epoch": 0.00644, "grad_norm": 5.1875, "grad_norm_var": 0.2775349934895833, "learning_rate": 0.0001, "loss": 6.5935, "loss/crossentropy": 1.999566912651062, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.4165455400943756, "step": 322 }, { "epoch": 0.00648, "grad_norm": 6.03125, "grad_norm_var": 0.27496337890625, "learning_rate": 0.0001, "loss": 7.0573, "loss/crossentropy": 2.545841693878174, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.4725654572248459, "step": 324 }, { "epoch": 0.00652, "grad_norm": 5.0625, "grad_norm_var": 0.2528483072916667, "learning_rate": 0.0001, "loss": 7.2351, "loss/crossentropy": 2.119086444377899, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.4649319499731064, "step": 326 }, { "epoch": 0.00656, "grad_norm": 5.21875, "grad_norm_var": 0.221728515625, "learning_rate": 0.0001, "loss": 6.8367, "loss/crossentropy": 2.365525245666504, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.5121739506721497, "step": 328 }, { "epoch": 0.0066, "grad_norm": 5.25, "grad_norm_var": 0.17467041015625, "learning_rate": 0.0001, "loss": 6.8384, "loss/crossentropy": 2.2604740858078003, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.4704676419496536, "step": 330 }, { "epoch": 0.00664, "grad_norm": 6.15625, "grad_norm_var": 0.18977864583333334, "learning_rate": 0.0001, "loss": 7.5125, "loss/crossentropy": 2.4891955852508545, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.5196183770895004, "step": 332 }, { "epoch": 0.00668, "grad_norm": 5.46875, "grad_norm_var": 0.1767578125, "learning_rate": 0.0001, "loss": 7.3139, "loss/crossentropy": 2.430082321166992, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.47671228647232056, "step": 334 }, { "epoch": 0.00672, "grad_norm": 5.53125, "grad_norm_var": 0.187353515625, "learning_rate": 0.0001, "loss": 6.6969, "loss/crossentropy": 2.2450510263442993, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.5124029517173767, "step": 336 }, { "epoch": 0.00676, "grad_norm": 5.375, "grad_norm_var": 0.18251546223958334, "learning_rate": 0.0001, "loss": 6.8537, "loss/crossentropy": 2.225212812423706, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.4358299970626831, "step": 338 }, { "epoch": 0.0068, "grad_norm": 6.71875, "grad_norm_var": 26.47734375, "learning_rate": 0.0001, "loss": 6.8775, "loss/crossentropy": 2.320846140384674, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.42757023870944977, "step": 340 }, { "epoch": 0.00684, "grad_norm": 4.84375, "grad_norm_var": 26.739453125, "learning_rate": 0.0001, "loss": 6.7394, "loss/crossentropy": 2.419093132019043, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.4633703678846359, "step": 342 }, { "epoch": 0.00688, "grad_norm": 5.3125, "grad_norm_var": 26.632405598958332, "learning_rate": 0.0001, "loss": 6.7304, "loss/crossentropy": 1.939517080783844, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.4387335330247879, "step": 344 }, { "epoch": 0.00692, "grad_norm": 5.75, "grad_norm_var": 26.504410807291666, "learning_rate": 0.0001, "loss": 7.0914, "loss/crossentropy": 2.695888638496399, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.48031529784202576, "step": 346 }, { "epoch": 0.00696, "grad_norm": 6.625, "grad_norm_var": 26.468094889322916, "learning_rate": 0.0001, "loss": 6.8381, "loss/crossentropy": 2.245330333709717, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.5032568573951721, "step": 348 }, { "epoch": 0.007, "grad_norm": 4.28125, "grad_norm_var": 26.707421875, "learning_rate": 0.0001, "loss": 6.4774, "loss/crossentropy": 1.9668607115745544, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.4390462785959244, "step": 350 }, { "epoch": 0.00704, "grad_norm": 5.25, "grad_norm_var": 26.92008056640625, "learning_rate": 0.0001, "loss": 6.7795, "loss/crossentropy": 2.4035123586654663, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.43734824657440186, "step": 352 }, { "epoch": 0.00708, "grad_norm": 6.5, "grad_norm_var": 26.851460774739582, "learning_rate": 0.0001, "loss": 7.3932, "loss/crossentropy": 2.4636529684066772, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.5208825469017029, "step": 354 }, { "epoch": 0.00712, "grad_norm": 6.4375, "grad_norm_var": 0.5743123372395833, "learning_rate": 0.0001, "loss": 6.8482, "loss/crossentropy": 2.085066556930542, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3971068561077118, "step": 356 }, { "epoch": 0.00716, "grad_norm": 4.59375, "grad_norm_var": 0.57847900390625, "learning_rate": 0.0001, "loss": 6.8962, "loss/crossentropy": 2.194266200065613, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.41747787594795227, "step": 358 }, { "epoch": 0.0072, "grad_norm": 4.59375, "grad_norm_var": 0.649462890625, "learning_rate": 0.0001, "loss": 6.7381, "loss/crossentropy": 2.4678618907928467, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.4633233994245529, "step": 360 }, { "epoch": 0.00724, "grad_norm": 5.59375, "grad_norm_var": 0.6476847330729166, "learning_rate": 0.0001, "loss": 6.5748, "loss/crossentropy": 2.362962484359741, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.4360974431037903, "step": 362 }, { "epoch": 0.00728, "grad_norm": 5.6875, "grad_norm_var": 0.5360514322916666, "learning_rate": 0.0001, "loss": 7.3497, "loss/crossentropy": 2.3162096738815308, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.4641396403312683, "step": 364 }, { "epoch": 0.00732, "grad_norm": 6.0625, "grad_norm_var": 0.4325358072916667, "learning_rate": 0.0001, "loss": 7.0856, "loss/crossentropy": 2.279396176338196, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.4331911951303482, "step": 366 }, { "epoch": 0.00736, "grad_norm": 4.96875, "grad_norm_var": 0.377587890625, "learning_rate": 0.0001, "loss": 6.8288, "loss/crossentropy": 2.333961606025696, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.43770918250083923, "step": 368 }, { "epoch": 0.0074, "grad_norm": 5.1875, "grad_norm_var": 0.27955322265625, "learning_rate": 0.0001, "loss": 6.9245, "loss/crossentropy": 2.130259871482849, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.4134685546159744, "step": 370 }, { "epoch": 0.00744, "grad_norm": 5.59375, "grad_norm_var": 0.19478759765625, "learning_rate": 0.0001, "loss": 6.4884, "loss/crossentropy": 2.3000282049179077, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.46207693219184875, "step": 372 }, { "epoch": 0.00748, "grad_norm": 5.59375, "grad_norm_var": 0.20399983723958334, "learning_rate": 0.0001, "loss": 7.3714, "loss/crossentropy": 2.687412142753601, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.46891947090625763, "step": 374 }, { "epoch": 0.00752, "grad_norm": 4.28125, "grad_norm_var": 0.21962483723958334, "learning_rate": 0.0001, "loss": 6.4194, "loss/crossentropy": 2.2366563081741333, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.42876073718070984, "step": 376 }, { "epoch": 0.00756, "grad_norm": 5.71875, "grad_norm_var": 0.23108317057291666, "learning_rate": 0.0001, "loss": 7.0141, "loss/crossentropy": 2.5960274934768677, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.4519210159778595, "step": 378 }, { "epoch": 0.0076, "grad_norm": 5.71875, "grad_norm_var": 0.23435872395833332, "learning_rate": 0.0001, "loss": 6.9654, "loss/crossentropy": 2.4690704345703125, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.5009289383888245, "step": 380 }, { "epoch": 0.00764, "grad_norm": 5.25, "grad_norm_var": 0.17952067057291668, "learning_rate": 0.0001, "loss": 6.6068, "loss/crossentropy": 2.188890814781189, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.4912077784538269, "step": 382 }, { "epoch": 0.00768, "grad_norm": 5.15625, "grad_norm_var": 0.19055582682291666, "learning_rate": 0.0001, "loss": 6.5789, "loss/crossentropy": 2.2374125719070435, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.4437579810619354, "step": 384 }, { "epoch": 0.00772, "grad_norm": 5.1875, "grad_norm_var": 0.19308268229166667, "learning_rate": 0.0001, "loss": 6.8081, "loss/crossentropy": 2.101546287536621, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.39717453718185425, "step": 386 }, { "epoch": 0.00776, "grad_norm": 5.1875, "grad_norm_var": 0.182666015625, "learning_rate": 0.0001, "loss": 6.5688, "loss/crossentropy": 2.2907408475875854, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.4657403528690338, "step": 388 }, { "epoch": 0.0078, "grad_norm": 4.53125, "grad_norm_var": 0.18203125, "learning_rate": 0.0001, "loss": 6.4664, "loss/crossentropy": 1.9909976720809937, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.39810416102409363, "step": 390 }, { "epoch": 0.00784, "grad_norm": 6.28125, "grad_norm_var": 0.23746337890625, "learning_rate": 0.0001, "loss": 6.6015, "loss/crossentropy": 2.109456777572632, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.4226441979408264, "step": 392 }, { "epoch": 0.00788, "grad_norm": 6.03125, "grad_norm_var": 0.26549072265625, "learning_rate": 0.0001, "loss": 7.2592, "loss/crossentropy": 2.330615997314453, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.5082881152629852, "step": 394 }, { "epoch": 0.00792, "grad_norm": 5.90625, "grad_norm_var": 0.49332275390625, "learning_rate": 0.0001, "loss": 7.301, "loss/crossentropy": 2.37632155418396, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.5454596877098083, "step": 396 }, { "epoch": 0.00796, "grad_norm": 4.9375, "grad_norm_var": 0.5669921875, "learning_rate": 0.0001, "loss": 6.6057, "loss/crossentropy": 2.0303866863250732, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3780263066291809, "step": 398 }, { "epoch": 0.008, "grad_norm": 4.1875, "grad_norm_var": 0.6671712239583333, "learning_rate": 0.0001, "loss": 6.8129, "loss/crossentropy": 2.077945590019226, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.4111042767763138, "step": 400 }, { "epoch": 0.00804, "grad_norm": 5.5625, "grad_norm_var": 0.6867146809895833, "learning_rate": 0.0001, "loss": 6.919, "loss/crossentropy": 2.3042391538619995, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.44224822521209717, "step": 402 }, { "epoch": 0.00808, "grad_norm": 4.3125, "grad_norm_var": 0.74537353515625, "learning_rate": 0.0001, "loss": 6.4785, "loss/crossentropy": 2.15978467464447, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.44845885038375854, "step": 404 }, { "epoch": 0.00812, "grad_norm": 4.90625, "grad_norm_var": 0.69576416015625, "learning_rate": 0.0001, "loss": 6.3875, "loss/crossentropy": 2.4571259021759033, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.4629521369934082, "step": 406 }, { "epoch": 0.00816, "grad_norm": 5.40625, "grad_norm_var": 0.6654947916666667, "learning_rate": 0.0001, "loss": 7.1165, "loss/crossentropy": 2.653234601020813, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.5556869208812714, "step": 408 }, { "epoch": 0.0082, "grad_norm": 4.53125, "grad_norm_var": 0.66138916015625, "learning_rate": 0.0001, "loss": 6.6643, "loss/crossentropy": 1.9738067388534546, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.42570993304252625, "step": 410 }, { "epoch": 0.00824, "grad_norm": 4.0625, "grad_norm_var": 0.46920166015625, "learning_rate": 0.0001, "loss": 6.2205, "loss/crossentropy": 2.093988060951233, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.40544557571411133, "step": 412 }, { "epoch": 0.00828, "grad_norm": 4.34375, "grad_norm_var": 0.36213785807291665, "learning_rate": 0.0001, "loss": 6.6356, "loss/crossentropy": 2.4798851013183594, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.4204765260219574, "step": 414 }, { "epoch": 0.00832, "grad_norm": 5.625, "grad_norm_var": 0.3986287434895833, "learning_rate": 0.0001, "loss": 6.5601, "loss/crossentropy": 2.4342020750045776, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.5148662775754929, "step": 416 }, { "epoch": 0.00836, "grad_norm": 5.40625, "grad_norm_var": 0.3985514322916667, "learning_rate": 0.0001, "loss": 6.7757, "loss/crossentropy": 2.3637804985046387, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.45475171506404877, "step": 418 }, { "epoch": 0.0084, "grad_norm": 4.09375, "grad_norm_var": 0.39586181640625, "learning_rate": 0.0001, "loss": 6.6923, "loss/crossentropy": 2.4066261053085327, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.45181581377983093, "step": 420 }, { "epoch": 0.00844, "grad_norm": 4.0, "grad_norm_var": 0.43176676432291666, "learning_rate": 0.0001, "loss": 6.2428, "loss/crossentropy": 2.1273797750473022, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3976929485797882, "step": 422 }, { "epoch": 0.00848, "grad_norm": 4.90625, "grad_norm_var": 0.26330973307291666, "learning_rate": 0.0001, "loss": 6.6524, "loss/crossentropy": 2.4227113723754883, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.46154454350471497, "step": 424 }, { "epoch": 0.00852, "grad_norm": 4.65625, "grad_norm_var": 0.261181640625, "learning_rate": 0.0001, "loss": 6.6558, "loss/crossentropy": 2.3502479791641235, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.43580910563468933, "step": 426 }, { "epoch": 0.00856, "grad_norm": 5.3125, "grad_norm_var": 0.26236572265625, "learning_rate": 0.0001, "loss": 6.903, "loss/crossentropy": 2.5034282207489014, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.4730219095945358, "step": 428 }, { "epoch": 0.0086, "grad_norm": 4.40625, "grad_norm_var": 0.2557576497395833, "learning_rate": 0.0001, "loss": 6.2148, "loss/crossentropy": 2.0902098417282104, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.409458264708519, "step": 430 }, { "epoch": 0.00864, "grad_norm": 4.9375, "grad_norm_var": 0.19420166015625, "learning_rate": 0.0001, "loss": 6.4634, "loss/crossentropy": 2.2203429341316223, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.41066519916057587, "step": 432 }, { "epoch": 0.00868, "grad_norm": 5.03125, "grad_norm_var": 0.14855143229166667, "learning_rate": 0.0001, "loss": 6.6943, "loss/crossentropy": 2.568304419517517, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.48846572637557983, "step": 434 }, { "epoch": 0.00872, "grad_norm": 5.15625, "grad_norm_var": 0.13489176432291666, "learning_rate": 0.0001, "loss": 6.4829, "loss/crossentropy": 2.359646439552307, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.43610572814941406, "step": 436 }, { "epoch": 0.00876, "grad_norm": 5.03125, "grad_norm_var": 0.08870035807291667, "learning_rate": 0.0001, "loss": 6.6119, "loss/crossentropy": 2.2751121520996094, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.45837563276290894, "step": 438 }, { "epoch": 0.0088, "grad_norm": 4.125, "grad_norm_var": 0.13665364583333334, "learning_rate": 0.0001, "loss": 6.5338, "loss/crossentropy": 2.334506392478943, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.4196038395166397, "step": 440 }, { "epoch": 0.00884, "grad_norm": 4.1875, "grad_norm_var": 0.15891927083333332, "learning_rate": 0.0001, "loss": 6.2206, "loss/crossentropy": 1.9731069803237915, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.41661541163921356, "step": 442 }, { "epoch": 0.00888, "grad_norm": 5.03125, "grad_norm_var": 0.13339436848958333, "learning_rate": 0.0001, "loss": 6.3377, "loss/crossentropy": 2.319058418273926, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.4656156301498413, "step": 444 }, { "epoch": 0.00892, "grad_norm": 4.125, "grad_norm_var": 0.15013020833333332, "learning_rate": 0.0001, "loss": 6.5345, "loss/crossentropy": 2.309122085571289, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.41400712728500366, "step": 446 }, { "epoch": 0.00896, "grad_norm": 4.53125, "grad_norm_var": 0.14468994140625, "learning_rate": 0.0001, "loss": 6.5385, "loss/crossentropy": 1.867617905139923, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.39080700278282166, "step": 448 }, { "epoch": 0.009, "grad_norm": 5.21875, "grad_norm_var": 0.79830322265625, "learning_rate": 0.0001, "loss": 6.799, "loss/crossentropy": 2.205033838748932, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.4887084364891052, "step": 450 }, { "epoch": 0.00904, "grad_norm": 4.9375, "grad_norm_var": 0.826171875, "learning_rate": 0.0001, "loss": 6.6476, "loss/crossentropy": 2.3054174184799194, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.4140937328338623, "step": 452 }, { "epoch": 0.00908, "grad_norm": 4.53125, "grad_norm_var": 0.8572224934895833, "learning_rate": 0.0001, "loss": 6.7036, "loss/crossentropy": 2.1358219981193542, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3849467635154724, "step": 454 }, { "epoch": 0.00912, "grad_norm": 4.9375, "grad_norm_var": 0.8132120768229166, "learning_rate": 0.0001, "loss": 6.4024, "loss/crossentropy": 1.9811997413635254, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.42263032495975494, "step": 456 }, { "epoch": 0.00916, "grad_norm": 4.8125, "grad_norm_var": 0.7787109375, "learning_rate": 0.0001, "loss": 6.8381, "loss/crossentropy": 2.319555103778839, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.4539293050765991, "step": 458 }, { "epoch": 0.0092, "grad_norm": 4.6875, "grad_norm_var": 0.7744425455729167, "learning_rate": 0.0001, "loss": 6.72, "loss/crossentropy": 2.4030569791793823, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.41373930871486664, "step": 460 }, { "epoch": 0.00924, "grad_norm": 4.8125, "grad_norm_var": 0.7218587239583333, "learning_rate": 0.0001, "loss": 6.7376, "loss/crossentropy": 2.4479328393936157, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.447835311293602, "step": 462 }, { "epoch": 0.00928, "grad_norm": 5.5, "grad_norm_var": 0.7289021809895834, "learning_rate": 0.0001, "loss": 7.0562, "loss/crossentropy": 2.056324601173401, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.4374549984931946, "step": 464 }, { "epoch": 0.00932, "grad_norm": 4.28125, "grad_norm_var": 0.19256184895833334, "learning_rate": 0.0001, "loss": 6.6994, "loss/crossentropy": 2.4104079008102417, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.4148041307926178, "step": 466 }, { "epoch": 0.00936, "grad_norm": 4.75, "grad_norm_var": 0.13527018229166668, "learning_rate": 0.0001, "loss": 7.0758, "loss/crossentropy": 2.734652876853943, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.511719822883606, "step": 468 }, { "epoch": 0.0094, "grad_norm": 5.0, "grad_norm_var": 0.13118082682291668, "learning_rate": 0.0001, "loss": 6.4772, "loss/crossentropy": 2.1748571395874023, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3852091133594513, "step": 470 }, { "epoch": 0.00944, "grad_norm": 4.625, "grad_norm_var": 0.13912353515625, "learning_rate": 0.0001, "loss": 6.7127, "loss/crossentropy": 2.476449966430664, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.4194856435060501, "step": 472 }, { "epoch": 0.00948, "grad_norm": 4.40625, "grad_norm_var": 0.16887613932291667, "learning_rate": 0.0001, "loss": 6.494, "loss/crossentropy": 2.6383973360061646, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.44083887338638306, "step": 474 }, { "epoch": 0.00952, "grad_norm": 4.375, "grad_norm_var": 0.18435872395833333, "learning_rate": 0.0001, "loss": 6.3184, "loss/crossentropy": 2.3149259090423584, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3863615244626999, "step": 476 }, { "epoch": 0.00956, "grad_norm": 5.0625, "grad_norm_var": 0.18388264973958332, "learning_rate": 0.0001, "loss": 6.6217, "loss/crossentropy": 2.3096635341644287, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.40995509922504425, "step": 478 }, { "epoch": 0.0096, "grad_norm": 4.34375, "grad_norm_var": 0.10846354166666666, "learning_rate": 0.0001, "loss": 6.4849, "loss/crossentropy": 2.6495853662490845, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.412080317735672, "step": 480 }, { "epoch": 0.00964, "grad_norm": 5.21875, "grad_norm_var": 0.13948160807291668, "learning_rate": 0.0001, "loss": 6.9293, "loss/crossentropy": 2.445754885673523, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.45622071623802185, "step": 482 }, { "epoch": 0.00968, "grad_norm": 4.40625, "grad_norm_var": 0.13661702473958334, "learning_rate": 0.0001, "loss": 6.4119, "loss/crossentropy": 2.418110966682434, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.40424875915050507, "step": 484 }, { "epoch": 0.00972, "grad_norm": 5.0625, "grad_norm_var": 0.15959879557291667, "learning_rate": 0.0001, "loss": 6.6356, "loss/crossentropy": 1.9564435482025146, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.45834848284721375, "step": 486 }, { "epoch": 0.00976, "grad_norm": 5.375, "grad_norm_var": 0.16946207682291667, "learning_rate": 0.0001, "loss": 6.7056, "loss/crossentropy": 2.3772581815719604, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.4418800473213196, "step": 488 }, { "epoch": 0.0098, "grad_norm": 4.0625, "grad_norm_var": 0.208056640625, "learning_rate": 0.0001, "loss": 6.3239, "loss/crossentropy": 1.9526153802871704, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3661540001630783, "step": 490 }, { "epoch": 0.00984, "grad_norm": 4.875, "grad_norm_var": 0.19999593098958332, "learning_rate": 0.0001, "loss": 6.7642, "loss/crossentropy": 2.40561842918396, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.44370119273662567, "step": 492 }, { "epoch": 0.00988, "grad_norm": 4.53125, "grad_norm_var": 0.1943359375, "learning_rate": 0.0001, "loss": 6.6475, "loss/crossentropy": 2.4316108226776123, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.4437306672334671, "step": 494 }, { "epoch": 0.00992, "grad_norm": 4.53125, "grad_norm_var": 0.25276285807291665, "learning_rate": 0.0001, "loss": 6.4095, "loss/crossentropy": 2.2919591665267944, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.4170967787504196, "step": 496 }, { "epoch": 0.00996, "grad_norm": 5.03125, "grad_norm_var": 0.21116129557291666, "learning_rate": 0.0001, "loss": 6.6291, "loss/crossentropy": 2.571357250213623, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.4073399156332016, "step": 498 }, { "epoch": 0.01, "grad_norm": 4.375, "grad_norm_var": 0.21330973307291667, "learning_rate": 0.0001, "loss": 6.2903, "loss/crossentropy": 2.389290928840637, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.41799379885196686, "step": 500 }, { "epoch": 0.01004, "grad_norm": 4.90625, "grad_norm_var": 0.20245768229166666, "learning_rate": 0.0001, "loss": 6.4319, "loss/crossentropy": 2.0904359221458435, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3914492577314377, "step": 502 }, { "epoch": 0.01008, "grad_norm": 3.625, "grad_norm_var": 0.218212890625, "learning_rate": 0.0001, "loss": 6.5581, "loss/crossentropy": 2.5435129404067993, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.4660491645336151, "step": 504 }, { "epoch": 0.01012, "grad_norm": 4.75, "grad_norm_var": 0.21458333333333332, "learning_rate": 0.0001, "loss": 6.3665, "loss/crossentropy": 2.039083242416382, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.38188865780830383, "step": 506 }, { "epoch": 0.01016, "grad_norm": 5.21875, "grad_norm_var": 0.361181640625, "learning_rate": 0.0001, "loss": 6.4862, "loss/crossentropy": 2.056805729866028, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.4257620573043823, "step": 508 }, { "epoch": 0.0102, "grad_norm": 4.03125, "grad_norm_var": 0.38592122395833334, "learning_rate": 0.0001, "loss": 6.4822, "loss/crossentropy": 2.6178410053253174, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.43898941576480865, "step": 510 }, { "epoch": 0.01024, "grad_norm": 4.59375, "grad_norm_var": 0.3405558268229167, "learning_rate": 0.0001, "loss": 6.4638, "loss/crossentropy": 2.232435703277588, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.37769296765327454, "step": 512 }, { "epoch": 0.01028, "grad_norm": 4.40625, "grad_norm_var": 0.33033447265625, "learning_rate": 0.0001, "loss": 6.378, "loss/crossentropy": 1.8679735660552979, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3805970698595047, "step": 514 }, { "epoch": 0.01032, "grad_norm": 4.3125, "grad_norm_var": 0.33175455729166664, "learning_rate": 0.0001, "loss": 6.7453, "loss/crossentropy": 2.6537472009658813, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.4450981914997101, "step": 516 }, { "epoch": 0.01036, "grad_norm": 4.03125, "grad_norm_var": 0.34915262858072915, "learning_rate": 0.0001, "loss": 6.2745, "loss/crossentropy": 2.5199841260910034, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.4748214781284332, "step": 518 }, { "epoch": 0.0104, "grad_norm": 5.5, "grad_norm_var": 0.3482004801432292, "learning_rate": 0.0001, "loss": 6.6212, "loss/crossentropy": 2.6603333950042725, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.4144483357667923, "step": 520 }, { "epoch": 0.01044, "grad_norm": 4.28125, "grad_norm_var": 0.3241119384765625, "learning_rate": 0.0001, "loss": 6.4073, "loss/crossentropy": 2.284039616584778, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.42801250517368317, "step": 522 }, { "epoch": 0.01048, "grad_norm": 3.78125, "grad_norm_var": 0.1894683837890625, "learning_rate": 0.0001, "loss": 6.2153, "loss/crossentropy": 2.473629951477051, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.4107852131128311, "step": 524 }, { "epoch": 0.01052, "grad_norm": 4.5, "grad_norm_var": 0.1827789306640625, "learning_rate": 0.0001, "loss": 6.6746, "loss/crossentropy": 2.1443774700164795, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3606857359409332, "step": 526 }, { "epoch": 0.01056, "grad_norm": 3.96875, "grad_norm_var": 0.18889058430989583, "learning_rate": 0.0001, "loss": 5.8493, "loss/crossentropy": 1.8425135016441345, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.37770508229732513, "step": 528 }, { "epoch": 0.0106, "grad_norm": 3.859375, "grad_norm_var": 0.179052734375, "learning_rate": 0.0001, "loss": 6.3319, "loss/crossentropy": 2.3705164194107056, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.4354119151830673, "step": 530 }, { "epoch": 0.01064, "grad_norm": 4.53125, "grad_norm_var": 0.16344401041666667, "learning_rate": 0.0001, "loss": 6.4823, "loss/crossentropy": 2.1141316294670105, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.34213581681251526, "step": 532 }, { "epoch": 0.01068, "grad_norm": 4.3125, "grad_norm_var": 0.15579325358072918, "learning_rate": 0.0001, "loss": 6.0454, "loss/crossentropy": 2.14878511428833, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3756616413593292, "step": 534 }, { "epoch": 0.01072, "grad_norm": 4.34375, "grad_norm_var": 0.052155558268229166, "learning_rate": 0.0001, "loss": 6.4363, "loss/crossentropy": 2.2513046264648438, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.416190966963768, "step": 536 }, { "epoch": 0.01076, "grad_norm": 3.921875, "grad_norm_var": 0.049169921875, "learning_rate": 0.0001, "loss": 6.2674, "loss/crossentropy": 2.1337047815322876, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3839789927005768, "step": 538 }, { "epoch": 0.0108, "grad_norm": 5.8125, "grad_norm_var": 0.20488993326822916, "learning_rate": 0.0001, "loss": 6.0559, "loss/crossentropy": 2.2019962072372437, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3806414008140564, "step": 540 }, { "epoch": 0.01084, "grad_norm": 4.125, "grad_norm_var": 0.2133697509765625, "learning_rate": 0.0001, "loss": 5.8434, "loss/crossentropy": 2.113224983215332, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3797851800918579, "step": 542 }, { "epoch": 0.01088, "grad_norm": 4.0, "grad_norm_var": 0.21507059733072917, "learning_rate": 0.0001, "loss": 6.5131, "loss/crossentropy": 2.461037516593933, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.41689516603946686, "step": 544 }, { "epoch": 0.01092, "grad_norm": 4.625, "grad_norm_var": 0.24849853515625, "learning_rate": 0.0001, "loss": 6.4191, "loss/crossentropy": 2.277098298072815, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.4108494818210602, "step": 546 }, { "epoch": 0.01096, "grad_norm": 4.34375, "grad_norm_var": 0.48818359375, "learning_rate": 0.0001, "loss": 6.3436, "loss/crossentropy": 2.007936477661133, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.36796192824840546, "step": 548 }, { "epoch": 0.011, "grad_norm": 4.625, "grad_norm_var": 0.5631795247395833, "learning_rate": 0.0001, "loss": 6.5793, "loss/crossentropy": 2.197320520877838, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.38722094893455505, "step": 550 }, { "epoch": 0.01104, "grad_norm": 4.6875, "grad_norm_var": 0.5419230143229167, "learning_rate": 0.0001, "loss": 6.3185, "loss/crossentropy": 2.225432515144348, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3928917348384857, "step": 552 }, { "epoch": 0.01108, "grad_norm": 4.28125, "grad_norm_var": 0.5455067952473959, "learning_rate": 0.0001, "loss": 5.9686, "loss/crossentropy": 2.5253326892852783, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.41547106206417084, "step": 554 }, { "epoch": 0.01112, "grad_norm": 4.15625, "grad_norm_var": 0.44996337890625, "learning_rate": 0.0001, "loss": 6.2158, "loss/crossentropy": 2.488635540008545, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.4523312896490097, "step": 556 }, { "epoch": 0.01116, "grad_norm": 4.03125, "grad_norm_var": 0.46119384765625, "learning_rate": 0.0001, "loss": 6.3632, "loss/crossentropy": 2.395568609237671, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.40695953369140625, "step": 558 }, { "epoch": 0.0112, "grad_norm": 4.28125, "grad_norm_var": 0.4615234375, "learning_rate": 0.0001, "loss": 5.998, "loss/crossentropy": 2.383823275566101, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.4482497274875641, "step": 560 }, { "epoch": 0.01124, "grad_norm": 4.46875, "grad_norm_var": 0.46280008951822915, "learning_rate": 0.0001, "loss": 5.968, "loss/crossentropy": 2.0261693000793457, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.36933301389217377, "step": 562 }, { "epoch": 0.01128, "grad_norm": 4.65625, "grad_norm_var": 0.2221588134765625, "learning_rate": 0.0001, "loss": 6.1467, "loss/crossentropy": 2.131769895553589, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.341948002576828, "step": 564 }, { "epoch": 0.01132, "grad_norm": 4.8125, "grad_norm_var": 0.0982574462890625, "learning_rate": 0.0001, "loss": 6.5288, "loss/crossentropy": 2.3899158239364624, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.417646586894989, "step": 566 }, { "epoch": 0.01136, "grad_norm": 4.21875, "grad_norm_var": 0.07683817545572917, "learning_rate": 0.0001, "loss": 6.6198, "loss/crossentropy": 2.3139528036117554, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.39010919630527496, "step": 568 }, { "epoch": 0.0114, "grad_norm": 4.3125, "grad_norm_var": 0.06782124837239584, "learning_rate": 0.0001, "loss": 6.1209, "loss/crossentropy": 1.966201364994049, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3554569333791733, "step": 570 }, { "epoch": 0.01144, "grad_norm": 4.21875, "grad_norm_var": 0.07773335774739583, "learning_rate": 0.0001, "loss": 6.1746, "loss/crossentropy": 2.2325466871261597, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.4008040726184845, "step": 572 }, { "epoch": 0.01148, "grad_norm": 4.96875, "grad_norm_var": 0.09848531087239583, "learning_rate": 0.0001, "loss": 6.121, "loss/crossentropy": 1.7670194506645203, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3362845778465271, "step": 574 }, { "epoch": 0.01152, "grad_norm": 4.125, "grad_norm_var": 0.10274149576822916, "learning_rate": 0.0001, "loss": 6.3956, "loss/crossentropy": 2.332284092903137, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.4190225303173065, "step": 576 }, { "epoch": 0.01156, "grad_norm": 4.71875, "grad_norm_var": 0.10549723307291667, "learning_rate": 0.0001, "loss": 6.4367, "loss/crossentropy": 2.265386700630188, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.4109695851802826, "step": 578 }, { "epoch": 0.0116, "grad_norm": 5.15625, "grad_norm_var": 0.13811442057291667, "learning_rate": 0.0001, "loss": 6.4475, "loss/crossentropy": 2.265889286994934, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.42450472712516785, "step": 580 }, { "epoch": 0.01164, "grad_norm": 4.5625, "grad_norm_var": 0.163134765625, "learning_rate": 0.0001, "loss": 6.1483, "loss/crossentropy": 1.867847204208374, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3627365529537201, "step": 582 }, { "epoch": 0.01168, "grad_norm": 3.796875, "grad_norm_var": 0.23681538899739582, "learning_rate": 0.0001, "loss": 6.5909, "loss/crossentropy": 2.5827555656433105, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.5194894820451736, "step": 584 }, { "epoch": 0.01172, "grad_norm": 4.5625, "grad_norm_var": 0.23188374837239584, "learning_rate": 0.0001, "loss": 6.6319, "loss/crossentropy": 2.26031893491745, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.412450835108757, "step": 586 }, { "epoch": 0.01176, "grad_norm": 5.53125, "grad_norm_var": 0.2718739827473958, "learning_rate": 0.0001, "loss": 6.5782, "loss/crossentropy": 2.1885104179382324, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.41636165976524353, "step": 588 }, { "epoch": 0.0118, "grad_norm": 4.53125, "grad_norm_var": 0.26240132649739584, "learning_rate": 0.0001, "loss": 6.5247, "loss/crossentropy": 2.682767391204834, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.4279082715511322, "step": 590 }, { "epoch": 0.01184, "grad_norm": 4.3125, "grad_norm_var": 0.24807027180989583, "learning_rate": 0.0001, "loss": 6.5862, "loss/crossentropy": 2.185304641723633, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.4266812950372696, "step": 592 }, { "epoch": 0.01188, "grad_norm": 4.15625, "grad_norm_var": 0.2589752197265625, "learning_rate": 0.0001, "loss": 6.2494, "loss/crossentropy": 2.383716344833374, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.38570962846279144, "step": 594 }, { "epoch": 0.01192, "grad_norm": 4.3125, "grad_norm_var": 0.2337066650390625, "learning_rate": 0.0001, "loss": 6.3066, "loss/crossentropy": 2.2963398694992065, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.43451687693595886, "step": 596 }, { "epoch": 0.01196, "grad_norm": 4.96875, "grad_norm_var": 0.2128570556640625, "learning_rate": 0.0001, "loss": 6.3368, "loss/crossentropy": 2.2728757858276367, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.4280686676502228, "step": 598 }, { "epoch": 0.012, "grad_norm": 4.34375, "grad_norm_var": 0.141650390625, "learning_rate": 0.0001, "loss": 6.1147, "loss/crossentropy": 2.3486615419387817, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.37224848568439484, "step": 600 }, { "epoch": 0.01204, "grad_norm": 3.765625, "grad_norm_var": 0.17512613932291668, "learning_rate": 0.0001, "loss": 6.4017, "loss/crossentropy": 2.3265292644500732, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.4079990088939667, "step": 602 }, { "epoch": 0.01208, "grad_norm": 3.84375, "grad_norm_var": 0.08847554524739583, "learning_rate": 0.0001, "loss": 6.0861, "loss/crossentropy": 2.45253849029541, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.42630523443222046, "step": 604 }, { "epoch": 0.01212, "grad_norm": 4.03125, "grad_norm_var": 0.08964742024739583, "learning_rate": 0.0001, "loss": 6.4101, "loss/crossentropy": 2.4417446851730347, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.40342913568019867, "step": 606 }, { "epoch": 0.01216, "grad_norm": 4.25, "grad_norm_var": 0.08886617024739583, "learning_rate": 0.0001, "loss": 6.2513, "loss/crossentropy": 2.1483529210090637, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3846609443426132, "step": 608 }, { "epoch": 0.0122, "grad_norm": 4.1875, "grad_norm_var": 0.09990132649739583, "learning_rate": 0.0001, "loss": 6.5479, "loss/crossentropy": 2.481536865234375, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.440000057220459, "step": 610 }, { "epoch": 0.01224, "grad_norm": 4.15625, "grad_norm_var": 0.09464518229166667, "learning_rate": 0.0001, "loss": 6.4986, "loss/crossentropy": 2.4472655057907104, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.41242682933807373, "step": 612 }, { "epoch": 0.01228, "grad_norm": 4.09375, "grad_norm_var": 0.07828369140625, "learning_rate": 0.0001, "loss": 6.4348, "loss/crossentropy": 2.3511135578155518, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.38262398540973663, "step": 614 }, { "epoch": 0.01232, "grad_norm": 3.71875, "grad_norm_var": 0.085791015625, "learning_rate": 0.0001, "loss": 6.4821, "loss/crossentropy": 2.5090683698654175, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.44584622979164124, "step": 616 }, { "epoch": 0.01236, "grad_norm": 4.75, "grad_norm_var": 0.1117340087890625, "learning_rate": 0.0001, "loss": 6.0395, "loss/crossentropy": 2.166012942790985, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.40271493792533875, "step": 618 }, { "epoch": 0.0124, "grad_norm": 5.09375, "grad_norm_var": 0.15511067708333334, "learning_rate": 0.0001, "loss": 6.4644, "loss/crossentropy": 2.583309531211853, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.43380285799503326, "step": 620 }, { "epoch": 0.01244, "grad_norm": 3.984375, "grad_norm_var": 0.19877827962239583, "learning_rate": 0.0001, "loss": 6.3289, "loss/crossentropy": 2.125720262527466, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.40389589965343475, "step": 622 }, { "epoch": 0.01248, "grad_norm": 6.34375, "grad_norm_var": 0.4984283447265625, "learning_rate": 0.0001, "loss": 5.9651, "loss/crossentropy": 1.7034094333648682, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.31466029584407806, "step": 624 }, { "epoch": 0.01252, "grad_norm": 4.40625, "grad_norm_var": 0.4901041666666667, "learning_rate": 0.0001, "loss": 5.9451, "loss/crossentropy": 2.163281202316284, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.38622182607650757, "step": 626 }, { "epoch": 0.01256, "grad_norm": 3.875, "grad_norm_var": 0.4982818603515625, "learning_rate": 0.0001, "loss": 5.8931, "loss/crossentropy": 1.7754453420639038, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.33256995677948, "step": 628 }, { "epoch": 0.0126, "grad_norm": 4.09375, "grad_norm_var": 0.4894683837890625, "learning_rate": 0.0001, "loss": 6.8606, "loss/crossentropy": 2.273309350013733, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.5319447964429855, "step": 630 }, { "epoch": 0.01264, "grad_norm": 3.828125, "grad_norm_var": 0.480126953125, "learning_rate": 0.0001, "loss": 6.2103, "loss/crossentropy": 2.397401988506317, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.4255771040916443, "step": 632 }, { "epoch": 0.01268, "grad_norm": 5.28125, "grad_norm_var": 0.521875, "learning_rate": 0.0001, "loss": 6.4179, "loss/crossentropy": 2.3898611068725586, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.46314406394958496, "step": 634 }, { "epoch": 0.01272, "grad_norm": 7.40625, "grad_norm_var": 1.0609212239583334, "learning_rate": 0.0001, "loss": 6.5634, "loss/crossentropy": 2.3740471601486206, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.48123428225517273, "step": 636 }, { "epoch": 0.01276, "grad_norm": 3.90625, "grad_norm_var": 1.0519846598307292, "learning_rate": 0.0001, "loss": 6.1136, "loss/crossentropy": 2.236217498779297, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.39564159512519836, "step": 638 }, { "epoch": 0.0128, "grad_norm": 4.21875, "grad_norm_var": 0.7972157796223959, "learning_rate": 0.0001, "loss": 6.3134, "loss/crossentropy": 2.4049174785614014, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.41312308609485626, "step": 640 }, { "epoch": 0.01284, "grad_norm": 4.0, "grad_norm_var": 0.80142822265625, "learning_rate": 0.0001, "loss": 6.2726, "loss/crossentropy": 2.173800766468048, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3983110189437866, "step": 642 }, { "epoch": 0.01288, "grad_norm": 4.65625, "grad_norm_var": 0.80084228515625, "learning_rate": 0.0001, "loss": 6.2779, "loss/crossentropy": 2.2124537229537964, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.38358184695243835, "step": 644 }, { "epoch": 0.01292, "grad_norm": 3.90625, "grad_norm_var": 0.8355377197265625, "learning_rate": 0.0001, "loss": 6.1209, "loss/crossentropy": 2.4939264059066772, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3805152475833893, "step": 646 }, { "epoch": 0.01296, "grad_norm": 4.25, "grad_norm_var": 0.8250935872395834, "learning_rate": 0.0001, "loss": 6.0714, "loss/crossentropy": 2.4526472091674805, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3958089202642441, "step": 648 }, { "epoch": 0.013, "grad_norm": 4.25, "grad_norm_var": 0.7562978108723958, "learning_rate": 0.0001, "loss": 6.3648, "loss/crossentropy": 2.5171029567718506, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.446771502494812, "step": 650 }, { "epoch": 0.01304, "grad_norm": 4.125, "grad_norm_var": 0.1185943603515625, "learning_rate": 0.0001, "loss": 6.1656, "loss/crossentropy": 2.270598888397217, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.38787929713726044, "step": 652 }, { "epoch": 0.01308, "grad_norm": 4.15625, "grad_norm_var": 0.11404520670572917, "learning_rate": 0.0001, "loss": 5.7739, "loss/crossentropy": 1.8847617506980896, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3258303850889206, "step": 654 }, { "epoch": 0.01312, "grad_norm": 3.9375, "grad_norm_var": 0.11286519368489584, "learning_rate": 0.0001, "loss": 6.042, "loss/crossentropy": 2.2471452951431274, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.38874460756778717, "step": 656 }, { "epoch": 0.01316, "grad_norm": 4.125, "grad_norm_var": 0.10369364420572917, "learning_rate": 0.0001, "loss": 6.4383, "loss/crossentropy": 2.3776252269744873, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.40553848445415497, "step": 658 }, { "epoch": 0.0132, "grad_norm": 3.84375, "grad_norm_var": 0.06461588541666667, "learning_rate": 0.0001, "loss": 5.5389, "loss/crossentropy": 2.291012167930603, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3630271404981613, "step": 660 }, { "epoch": 0.01324, "grad_norm": 3.90625, "grad_norm_var": 0.06116536458333333, "learning_rate": 0.0001, "loss": 6.298, "loss/crossentropy": 2.2029112577438354, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.37700483202934265, "step": 662 }, { "epoch": 0.01328, "grad_norm": 3.984375, "grad_norm_var": 0.0598541259765625, "learning_rate": 0.0001, "loss": 6.4093, "loss/crossentropy": 2.571411967277527, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.5071892440319061, "step": 664 }, { "epoch": 0.01332, "grad_norm": 3.484375, "grad_norm_var": 0.045947265625, "learning_rate": 0.0001, "loss": 5.6839, "loss/crossentropy": 2.148792862892151, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3591457009315491, "step": 666 }, { "epoch": 0.01336, "grad_norm": 4.09375, "grad_norm_var": 0.04755452473958333, "learning_rate": 0.0001, "loss": 6.4444, "loss/crossentropy": 2.5091140270233154, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3882133811712265, "step": 668 }, { "epoch": 0.0134, "grad_norm": 4.09375, "grad_norm_var": 0.049332682291666666, "learning_rate": 0.0001, "loss": 6.15, "loss/crossentropy": 2.4669524431228638, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3941466957330704, "step": 670 }, { "epoch": 0.01344, "grad_norm": 3.9375, "grad_norm_var": 0.06620686848958333, "learning_rate": 0.0001, "loss": 6.5358, "loss/crossentropy": 2.4111422300338745, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3973373472690582, "step": 672 }, { "epoch": 0.01348, "grad_norm": 4.1875, "grad_norm_var": 0.06495768229166667, "learning_rate": 0.0001, "loss": 5.765, "loss/crossentropy": 2.1109864115715027, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.36189010739326477, "step": 674 }, { "epoch": 0.01352, "grad_norm": 3.578125, "grad_norm_var": 0.0744140625, "learning_rate": 0.0001, "loss": 6.0289, "loss/crossentropy": 2.069494664669037, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3562029302120209, "step": 676 }, { "epoch": 0.01356, "grad_norm": 4.0625, "grad_norm_var": 0.07224934895833333, "learning_rate": 0.0001, "loss": 6.4526, "loss/crossentropy": 2.1924527883529663, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.38948506116867065, "step": 678 }, { "epoch": 0.0136, "grad_norm": 3.6875, "grad_norm_var": 0.082421875, "learning_rate": 0.0001, "loss": 5.7311, "loss/crossentropy": 2.1603400707244873, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.40414859354496, "step": 680 }, { "epoch": 0.01364, "grad_norm": 3.71875, "grad_norm_var": 0.07177632649739583, "learning_rate": 0.0001, "loss": 6.2459, "loss/crossentropy": 2.515262722969055, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.4363710880279541, "step": 682 }, { "epoch": 0.01368, "grad_norm": 4.09375, "grad_norm_var": 0.07869364420572916, "learning_rate": 0.0001, "loss": 6.1174, "loss/crossentropy": 2.3615161180496216, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.35220713913440704, "step": 684 }, { "epoch": 0.01372, "grad_norm": 6.03125, "grad_norm_var": 0.3293690999348958, "learning_rate": 0.0001, "loss": 6.1941, "loss/crossentropy": 1.920493245124817, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.35293935239315033, "step": 686 }, { "epoch": 0.01376, "grad_norm": 4.03125, "grad_norm_var": 0.31579488118489585, "learning_rate": 0.0001, "loss": 6.4064, "loss/crossentropy": 2.493665337562561, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.4243907481431961, "step": 688 }, { "epoch": 0.0138, "grad_norm": 4.03125, "grad_norm_var": 0.3451324462890625, "learning_rate": 0.0001, "loss": 6.1519, "loss/crossentropy": 2.1182271242141724, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.39624081552028656, "step": 690 }, { "epoch": 0.01384, "grad_norm": 4.09375, "grad_norm_var": 0.32034098307291664, "learning_rate": 0.0001, "loss": 6.2457, "loss/crossentropy": 2.146227180957794, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3491668850183487, "step": 692 }, { "epoch": 0.01388, "grad_norm": 3.734375, "grad_norm_var": 0.3376261393229167, "learning_rate": 0.0001, "loss": 5.7331, "loss/crossentropy": 1.8458876609802246, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.34608474373817444, "step": 694 }, { "epoch": 0.01392, "grad_norm": 4.03125, "grad_norm_var": 0.3294911702473958, "learning_rate": 0.0001, "loss": 6.3661, "loss/crossentropy": 2.270371675491333, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.33937984704971313, "step": 696 }, { "epoch": 0.01396, "grad_norm": 3.875, "grad_norm_var": 0.32066141764322914, "learning_rate": 0.0001, "loss": 5.8264, "loss/crossentropy": 2.040702223777771, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3821101486682892, "step": 698 }, { "epoch": 0.014, "grad_norm": 4.09375, "grad_norm_var": 0.3197428385416667, "learning_rate": 0.0001, "loss": 6.1216, "loss/crossentropy": 2.1711822152137756, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.35383065044879913, "step": 700 }, { "epoch": 0.01404, "grad_norm": 3.890625, "grad_norm_var": 0.0597076416015625, "learning_rate": 0.0001, "loss": 5.8384, "loss/crossentropy": 2.3292651176452637, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.37580642104148865, "step": 702 }, { "epoch": 0.01408, "grad_norm": 4.34375, "grad_norm_var": 0.06575520833333333, "learning_rate": 0.0001, "loss": 6.0789, "loss/crossentropy": 2.243735432624817, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3997037708759308, "step": 704 }, { "epoch": 0.01412, "grad_norm": 3.921875, "grad_norm_var": 0.0524566650390625, "learning_rate": 0.0001, "loss": 5.9987, "loss/crossentropy": 1.9908145666122437, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.33391132950782776, "step": 706 }, { "epoch": 0.01416, "grad_norm": 4.375, "grad_norm_var": 0.06243082682291667, "learning_rate": 0.0001, "loss": 5.536, "loss/crossentropy": 2.280096471309662, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3899015784263611, "step": 708 }, { "epoch": 0.0142, "grad_norm": 3.6875, "grad_norm_var": 0.12727762858072916, "learning_rate": 0.0001, "loss": 5.9373, "loss/crossentropy": 2.0714810490608215, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3797626197338104, "step": 710 }, { "epoch": 0.01424, "grad_norm": 3.875, "grad_norm_var": 0.14480692545572918, "learning_rate": 0.0001, "loss": 6.2466, "loss/crossentropy": 2.250674605369568, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3948116898536682, "step": 712 }, { "epoch": 0.01428, "grad_norm": 3.65625, "grad_norm_var": 0.15719401041666667, "learning_rate": 0.0001, "loss": 5.8891, "loss/crossentropy": 2.0895228385925293, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.34471337497234344, "step": 714 }, { "epoch": 0.01432, "grad_norm": 4.71875, "grad_norm_var": 0.1826812744140625, "learning_rate": 0.0001, "loss": 6.3748, "loss/crossentropy": 2.337170124053955, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.4226381927728653, "step": 716 }, { "epoch": 0.01436, "grad_norm": 4.03125, "grad_norm_var": 0.174462890625, "learning_rate": 0.0001, "loss": 6.3996, "loss/crossentropy": 2.337436556816101, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3738469183444977, "step": 718 }, { "epoch": 0.0144, "grad_norm": 4.15625, "grad_norm_var": 0.1669921875, "learning_rate": 0.0001, "loss": 6.0278, "loss/crossentropy": 1.9506489634513855, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.37431904673576355, "step": 720 }, { "epoch": 0.01444, "grad_norm": 3.71875, "grad_norm_var": 0.17283528645833332, "learning_rate": 0.0001, "loss": 5.8083, "loss/crossentropy": 2.253044009208679, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.36500048637390137, "step": 722 }, { "epoch": 0.01448, "grad_norm": 5.15625, "grad_norm_var": 0.23413798014322917, "learning_rate": 0.0001, "loss": 6.2307, "loss/crossentropy": 1.9980219006538391, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.34674490988254547, "step": 724 }, { "epoch": 0.01452, "grad_norm": 3.609375, "grad_norm_var": 0.1903472900390625, "learning_rate": 0.0001, "loss": 5.8283, "loss/crossentropy": 1.894662618637085, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.34921175241470337, "step": 726 }, { "epoch": 0.01456, "grad_norm": 3.609375, "grad_norm_var": 0.18310546875, "learning_rate": 0.0001, "loss": 6.1117, "loss/crossentropy": 2.304685115814209, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.36858032643795013, "step": 728 }, { "epoch": 0.0146, "grad_norm": 3.6875, "grad_norm_var": 0.17696940104166667, "learning_rate": 0.0001, "loss": 6.1041, "loss/crossentropy": 1.9020891189575195, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3854057639837265, "step": 730 }, { "epoch": 0.01464, "grad_norm": 3.4375, "grad_norm_var": 0.15485026041666666, "learning_rate": 0.0001, "loss": 5.5327, "loss/crossentropy": 1.707019329071045, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2961048036813736, "step": 732 }, { "epoch": 0.01468, "grad_norm": 3.828125, "grad_norm_var": 0.15344136555989582, "learning_rate": 0.0001, "loss": 6.0543, "loss/crossentropy": 2.423463463783264, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3739102631807327, "step": 734 }, { "epoch": 0.01472, "grad_norm": 4.0625, "grad_norm_var": 0.15335286458333333, "learning_rate": 0.0001, "loss": 6.2855, "loss/crossentropy": 2.0490055680274963, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.4027387350797653, "step": 736 }, { "epoch": 0.01476, "grad_norm": 3.984375, "grad_norm_var": 0.15038960774739582, "learning_rate": 0.0001, "loss": 6.1236, "loss/crossentropy": 2.3712635040283203, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3602859079837799, "step": 738 }, { "epoch": 0.0148, "grad_norm": 3.75, "grad_norm_var": 0.058649698893229164, "learning_rate": 0.0001, "loss": 6.2938, "loss/crossentropy": 2.306379556655884, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.38698625564575195, "step": 740 }, { "epoch": 0.01484, "grad_norm": 3.984375, "grad_norm_var": 0.055939737955729166, "learning_rate": 0.0001, "loss": 6.3983, "loss/crossentropy": 2.6846178770065308, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3881431221961975, "step": 742 }, { "epoch": 0.01488, "grad_norm": 3.9375, "grad_norm_var": 0.058690388997395836, "learning_rate": 0.0001, "loss": 5.7984, "loss/crossentropy": 2.0555977821350098, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.36078818142414093, "step": 744 }, { "epoch": 0.01492, "grad_norm": 4.09375, "grad_norm_var": 0.059325154622395834, "learning_rate": 0.0001, "loss": 5.7834, "loss/crossentropy": 2.0597304701805115, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.34516778588294983, "step": 746 }, { "epoch": 0.01496, "grad_norm": 3.5625, "grad_norm_var": 0.049479166666666664, "learning_rate": 0.0001, "loss": 5.8911, "loss/crossentropy": 1.9607329964637756, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.33818933367729187, "step": 748 }, { "epoch": 0.015, "grad_norm": 3.546875, "grad_norm_var": 0.057291666666666664, "learning_rate": 0.0001, "loss": 5.8484, "loss/crossentropy": 1.7854246497154236, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3463115990161896, "step": 750 }, { "epoch": 0.01504, "grad_norm": 3.703125, "grad_norm_var": 0.0590240478515625, "learning_rate": 0.0001, "loss": 5.7788, "loss/crossentropy": 1.845078468322754, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3714388310909271, "step": 752 }, { "epoch": 0.01508, "grad_norm": 3.9375, "grad_norm_var": 0.05771077473958333, "learning_rate": 0.0001, "loss": 6.1677, "loss/crossentropy": 2.3951027393341064, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3723580837249756, "step": 754 }, { "epoch": 0.01512, "grad_norm": 3.6875, "grad_norm_var": 0.032486979166666666, "learning_rate": 0.0001, "loss": 6.219, "loss/crossentropy": 2.500870108604431, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.42450079321861267, "step": 756 }, { "epoch": 0.01516, "grad_norm": 3.75, "grad_norm_var": 0.030060831705729166, "learning_rate": 0.0001, "loss": 5.886, "loss/crossentropy": 2.1584274768829346, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3798908591270447, "step": 758 }, { "epoch": 0.0152, "grad_norm": 3.5625, "grad_norm_var": 0.028251139322916667, "learning_rate": 0.0001, "loss": 5.7937, "loss/crossentropy": 2.3126983642578125, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.33131279051303864, "step": 760 }, { "epoch": 0.01524, "grad_norm": 3.5, "grad_norm_var": 0.020246378580729165, "learning_rate": 0.0001, "loss": 5.9182, "loss/crossentropy": 2.349764347076416, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3910932093858719, "step": 762 }, { "epoch": 0.01528, "grad_norm": 3.640625, "grad_norm_var": 0.0224273681640625, "learning_rate": 0.0001, "loss": 6.0472, "loss/crossentropy": 2.2232795357704163, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3483322113752365, "step": 764 }, { "epoch": 0.01532, "grad_norm": 3.65625, "grad_norm_var": 0.0207427978515625, "learning_rate": 0.0001, "loss": 6.0312, "loss/crossentropy": 2.2273412942886353, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.38545119762420654, "step": 766 }, { "epoch": 0.01536, "grad_norm": 4.125, "grad_norm_var": 0.03355712890625, "learning_rate": 0.0001, "loss": 6.0523, "loss/crossentropy": 2.5879149436950684, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.38191574811935425, "step": 768 }, { "epoch": 0.0154, "grad_norm": 3.953125, "grad_norm_var": 0.034032185872395836, "learning_rate": 0.0001, "loss": 6.027, "loss/crossentropy": 2.3305420875549316, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3704134076833725, "step": 770 }, { "epoch": 0.01544, "grad_norm": 3.59375, "grad_norm_var": 0.03241780598958333, "learning_rate": 0.0001, "loss": 6.121, "loss/crossentropy": 2.0433666706085205, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.34387587010860443, "step": 772 }, { "epoch": 0.01548, "grad_norm": 3.609375, "grad_norm_var": 0.03337300618489583, "learning_rate": 0.0001, "loss": 5.5837, "loss/crossentropy": 2.1127337217330933, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3484109789133072, "step": 774 }, { "epoch": 0.01552, "grad_norm": 3.859375, "grad_norm_var": 0.029002888997395834, "learning_rate": 0.0001, "loss": 6.0028, "loss/crossentropy": 2.1637459993362427, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3757011145353317, "step": 776 }, { "epoch": 0.01556, "grad_norm": 3.734375, "grad_norm_var": 0.024446614583333335, "learning_rate": 0.0001, "loss": 5.9904, "loss/crossentropy": 2.4118471145629883, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3797076344490051, "step": 778 }, { "epoch": 0.0156, "grad_norm": 3.828125, "grad_norm_var": 0.022191365559895832, "learning_rate": 0.0001, "loss": 6.2649, "loss/crossentropy": 1.9410768151283264, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3254295587539673, "step": 780 }, { "epoch": 0.01564, "grad_norm": 3.609375, "grad_norm_var": 0.023558553059895834, "learning_rate": 0.0001, "loss": 5.9008, "loss/crossentropy": 2.1669737100601196, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3583529591560364, "step": 782 }, { "epoch": 0.01568, "grad_norm": 3.5625, "grad_norm_var": 0.017606608072916665, "learning_rate": 0.0001, "loss": 5.9868, "loss/crossentropy": 2.217113733291626, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3503105044364929, "step": 784 }, { "epoch": 0.01572, "grad_norm": 3.84375, "grad_norm_var": 0.017154947916666666, "learning_rate": 0.0001, "loss": 6.0695, "loss/crossentropy": 2.588438868522644, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3941201716661453, "step": 786 }, { "epoch": 0.01576, "grad_norm": 3.546875, "grad_norm_var": 0.018310546875, "learning_rate": 0.0001, "loss": 5.9436, "loss/crossentropy": 2.3925808668136597, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3755808621644974, "step": 788 }, { "epoch": 0.0158, "grad_norm": 3.625, "grad_norm_var": 0.018684895833333333, "learning_rate": 0.0001, "loss": 5.7254, "loss/crossentropy": 1.9568504691123962, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3865346759557724, "step": 790 }, { "epoch": 0.01584, "grad_norm": 3.96875, "grad_norm_var": 0.024104817708333334, "learning_rate": 0.0001, "loss": 6.0174, "loss/crossentropy": 2.336462616920471, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.36937348544597626, "step": 792 }, { "epoch": 0.01588, "grad_norm": 3.421875, "grad_norm_var": 0.03623046875, "learning_rate": 0.0001, "loss": 5.7742, "loss/crossentropy": 2.1867082715034485, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.35803911089897156, "step": 794 }, { "epoch": 0.01592, "grad_norm": 3.765625, "grad_norm_var": 0.03877665201822917, "learning_rate": 0.0001, "loss": 6.2825, "loss/crossentropy": 2.070562243461609, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.42781224846839905, "step": 796 }, { "epoch": 0.01596, "grad_norm": 4.09375, "grad_norm_var": 0.044384765625, "learning_rate": 0.0001, "loss": 6.401, "loss/crossentropy": 2.160820960998535, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.34495553374290466, "step": 798 }, { "epoch": 0.016, "grad_norm": 4.0, "grad_norm_var": 0.045426432291666666, "learning_rate": 0.0001, "loss": 5.9628, "loss/crossentropy": 2.3424230813980103, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.4417698383331299, "step": 800 }, { "epoch": 0.01604, "grad_norm": 4.59375, "grad_norm_var": 0.08385009765625, "learning_rate": 0.0001, "loss": 6.1984, "loss/crossentropy": 2.090175747871399, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.35883304476737976, "step": 802 }, { "epoch": 0.01608, "grad_norm": 3.90625, "grad_norm_var": 0.0759429931640625, "learning_rate": 0.0001, "loss": 6.2044, "loss/crossentropy": 2.460660457611084, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.36723683774471283, "step": 804 }, { "epoch": 0.01612, "grad_norm": 3.78125, "grad_norm_var": 0.0783203125, "learning_rate": 0.0001, "loss": 5.8788, "loss/crossentropy": 2.2680885791778564, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3925183415412903, "step": 806 }, { "epoch": 0.01616, "grad_norm": 3.796875, "grad_norm_var": 0.10422261555989583, "learning_rate": 0.0001, "loss": 6.1179, "loss/crossentropy": 2.272566020488739, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3425859659910202, "step": 808 }, { "epoch": 0.0162, "grad_norm": 3.546875, "grad_norm_var": 0.10061442057291667, "learning_rate": 0.0001, "loss": 5.8933, "loss/crossentropy": 2.2417107820510864, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3634066879749298, "step": 810 }, { "epoch": 0.01624, "grad_norm": 4.09375, "grad_norm_var": 0.10075581868489583, "learning_rate": 0.0001, "loss": 5.9907, "loss/crossentropy": 2.2117987275123596, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3669978231191635, "step": 812 }, { "epoch": 0.01628, "grad_norm": 4.53125, "grad_norm_var": 0.12078348795572917, "learning_rate": 0.0001, "loss": 6.1767, "loss/crossentropy": 2.3471380472183228, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.39207911491394043, "step": 814 }, { "epoch": 0.01632, "grad_norm": 4.1875, "grad_norm_var": 0.12200113932291666, "learning_rate": 0.0001, "loss": 6.005, "loss/crossentropy": 2.1516740322113037, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3394138962030411, "step": 816 }, { "epoch": 0.01636, "grad_norm": 3.765625, "grad_norm_var": 0.10528055826822917, "learning_rate": 0.0001, "loss": 6.0827, "loss/crossentropy": 2.5085272789001465, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.39268872141838074, "step": 818 }, { "epoch": 0.0164, "grad_norm": 3.515625, "grad_norm_var": 0.1198883056640625, "learning_rate": 0.0001, "loss": 6.0363, "loss/crossentropy": 2.3051916360855103, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3798936903476715, "step": 820 }, { "epoch": 0.01644, "grad_norm": 3.5, "grad_norm_var": 0.14094136555989584, "learning_rate": 0.0001, "loss": 5.4403, "loss/crossentropy": 2.1685640811920166, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3666132390499115, "step": 822 }, { "epoch": 0.01648, "grad_norm": 3.53125, "grad_norm_var": 0.109521484375, "learning_rate": 0.0001, "loss": 5.6981, "loss/crossentropy": 2.3374987840652466, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.32505205273628235, "step": 824 }, { "epoch": 0.01652, "grad_norm": 5.4375, "grad_norm_var": 0.280859375, "learning_rate": 0.0001, "loss": 6.1428, "loss/crossentropy": 2.6427528858184814, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.4296632409095764, "step": 826 }, { "epoch": 0.01656, "grad_norm": 4.0, "grad_norm_var": 0.28609619140625, "learning_rate": 0.0001, "loss": 5.8304, "loss/crossentropy": 2.1534847617149353, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.37605202198028564, "step": 828 }, { "epoch": 0.0166, "grad_norm": 4.28125, "grad_norm_var": 0.2637278238932292, "learning_rate": 0.0001, "loss": 6.2115, "loss/crossentropy": 1.9642478227615356, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.43740569055080414, "step": 830 }, { "epoch": 0.01664, "grad_norm": 4.59375, "grad_norm_var": 3.3863433837890624, "learning_rate": 0.0001, "loss": 6.5306, "loss/crossentropy": 2.283148407936096, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3901669532060623, "step": 832 }, { "epoch": 0.01668, "grad_norm": 3.46875, "grad_norm_var": 3.382255045572917, "learning_rate": 0.0001, "loss": 6.0831, "loss/crossentropy": 2.418351888656616, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.4011431038379669, "step": 834 }, { "epoch": 0.01672, "grad_norm": 4.03125, "grad_norm_var": 3.322565714518229, "learning_rate": 0.0001, "loss": 6.1852, "loss/crossentropy": 2.40928852558136, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.40596309304237366, "step": 836 }, { "epoch": 0.01676, "grad_norm": 3.34375, "grad_norm_var": 3.3059234619140625, "learning_rate": 0.0001, "loss": 5.791, "loss/crossentropy": 2.4211392402648926, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3640473484992981, "step": 838 }, { "epoch": 0.0168, "grad_norm": 3.484375, "grad_norm_var": 3.2890625, "learning_rate": 0.0001, "loss": 5.6553, "loss/crossentropy": 2.047215461730957, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3346950262784958, "step": 840 }, { "epoch": 0.01684, "grad_norm": 3.578125, "grad_norm_var": 3.28623046875, "learning_rate": 0.0001, "loss": 5.7093, "loss/crossentropy": 2.020021378993988, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3442958742380142, "step": 842 }, { "epoch": 0.01688, "grad_norm": 3.96875, "grad_norm_var": 3.27164306640625, "learning_rate": 0.0001, "loss": 6.1484, "loss/crossentropy": 2.1585127115249634, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3768642693758011, "step": 844 }, { "epoch": 0.01692, "grad_norm": 3.84375, "grad_norm_var": 3.298802693684896, "learning_rate": 0.0001, "loss": 5.8512, "loss/crossentropy": 2.2717286348342896, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3644135594367981, "step": 846 }, { "epoch": 0.01696, "grad_norm": 3.828125, "grad_norm_var": 0.06539713541666667, "learning_rate": 0.0001, "loss": 5.9203, "loss/crossentropy": 2.2683321237564087, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3795373737812042, "step": 848 }, { "epoch": 0.017, "grad_norm": 3.1875, "grad_norm_var": 0.07066141764322917, "learning_rate": 0.0001, "loss": 5.7235, "loss/crossentropy": 2.1189464330673218, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.35512739419937134, "step": 850 }, { "epoch": 0.01704, "grad_norm": 3.65625, "grad_norm_var": 0.060384114583333336, "learning_rate": 0.0001, "loss": 5.6536, "loss/crossentropy": 2.260777235031128, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.35152101516723633, "step": 852 }, { "epoch": 0.01708, "grad_norm": 3.671875, "grad_norm_var": 0.06897379557291666, "learning_rate": 0.0001, "loss": 5.93, "loss/crossentropy": 2.323577642440796, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.32252687215805054, "step": 854 }, { "epoch": 0.01712, "grad_norm": 3.71875, "grad_norm_var": 0.07024637858072917, "learning_rate": 0.0001, "loss": 6.1006, "loss/crossentropy": 2.5965300798416138, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.43394728004932404, "step": 856 }, { "epoch": 0.01716, "grad_norm": 3.140625, "grad_norm_var": 0.08170572916666667, "learning_rate": 0.0001, "loss": 5.8612, "loss/crossentropy": 2.078580856323242, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3437638282775879, "step": 858 }, { "epoch": 0.0172, "grad_norm": 3.359375, "grad_norm_var": 0.08163960774739583, "learning_rate": 0.0001, "loss": 5.8394, "loss/crossentropy": 2.425456404685974, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3522993326187134, "step": 860 }, { "epoch": 0.01724, "grad_norm": 3.515625, "grad_norm_var": 0.08478190104166666, "learning_rate": 0.0001, "loss": 6.0154, "loss/crossentropy": 2.2830835580825806, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.37188920378685, "step": 862 }, { "epoch": 0.01728, "grad_norm": 3.8125, "grad_norm_var": 0.06896870930989583, "learning_rate": 0.0001, "loss": 5.9086, "loss/crossentropy": 2.090674340724945, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.32989686727523804, "step": 864 }, { "epoch": 0.01732, "grad_norm": 3.8125, "grad_norm_var": 0.0698394775390625, "learning_rate": 0.0001, "loss": 5.9617, "loss/crossentropy": 2.304458498954773, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.37342821061611176, "step": 866 }, { "epoch": 0.01736, "grad_norm": 3.421875, "grad_norm_var": 0.0715484619140625, "learning_rate": 0.0001, "loss": 5.8593, "loss/crossentropy": 2.6545844078063965, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3627774566411972, "step": 868 }, { "epoch": 0.0174, "grad_norm": 3.3125, "grad_norm_var": 0.059789021809895836, "learning_rate": 0.0001, "loss": 5.6956, "loss/crossentropy": 1.9977945685386658, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.318715900182724, "step": 870 }, { "epoch": 0.01744, "grad_norm": 3.203125, "grad_norm_var": 0.08033854166666667, "learning_rate": 0.0001, "loss": 5.7408, "loss/crossentropy": 1.9226595759391785, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3187277615070343, "step": 872 }, { "epoch": 0.01748, "grad_norm": 3.6875, "grad_norm_var": 0.0694732666015625, "learning_rate": 0.0001, "loss": 5.9863, "loss/crossentropy": 2.323302686214447, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.36724327504634857, "step": 874 }, { "epoch": 0.01752, "grad_norm": 3.5625, "grad_norm_var": 0.07043355305989583, "learning_rate": 0.0001, "loss": 5.9913, "loss/crossentropy": 2.254343032836914, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3444042354822159, "step": 876 }, { "epoch": 0.01756, "grad_norm": 3.296875, "grad_norm_var": 0.0774078369140625, "learning_rate": 0.0001, "loss": 5.5038, "loss/crossentropy": 2.0819836854934692, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3477388769388199, "step": 878 }, { "epoch": 0.0176, "grad_norm": 3.703125, "grad_norm_var": 0.10198160807291666, "learning_rate": 0.0001, "loss": 5.9947, "loss/crossentropy": 2.377693295478821, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.36597058176994324, "step": 880 }, { "epoch": 0.01764, "grad_norm": 4.21875, "grad_norm_var": 0.11789449055989583, "learning_rate": 0.0001, "loss": 6.3011, "loss/crossentropy": 2.5598798990249634, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.4082639515399933, "step": 882 }, { "epoch": 0.01768, "grad_norm": 3.6875, "grad_norm_var": 0.1294830322265625, "learning_rate": 0.0001, "loss": 5.9966, "loss/crossentropy": 2.2847843170166016, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3531750440597534, "step": 884 }, { "epoch": 0.01772, "grad_norm": 3.390625, "grad_norm_var": 0.12878316243489582, "learning_rate": 0.0001, "loss": 5.6685, "loss/crossentropy": 1.8283140063285828, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.336555078625679, "step": 886 }, { "epoch": 0.01776, "grad_norm": 3.65625, "grad_norm_var": 0.09971415201822917, "learning_rate": 0.0001, "loss": 5.9507, "loss/crossentropy": 2.1001436710357666, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3380406051874161, "step": 888 }, { "epoch": 0.0178, "grad_norm": 4.21875, "grad_norm_var": 0.11876627604166666, "learning_rate": 0.0001, "loss": 5.7552, "loss/crossentropy": 2.0079030990600586, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.35362809896469116, "step": 890 }, { "epoch": 0.01784, "grad_norm": 4.46875, "grad_norm_var": 0.15650634765625, "learning_rate": 0.0001, "loss": 5.7416, "loss/crossentropy": 2.176286220550537, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.34369874000549316, "step": 892 }, { "epoch": 0.01788, "grad_norm": 3.984375, "grad_norm_var": 0.13087565104166668, "learning_rate": 0.0001, "loss": 5.9236, "loss/crossentropy": 2.17675244808197, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3495863378047943, "step": 894 }, { "epoch": 0.01792, "grad_norm": 4.0, "grad_norm_var": 0.12189127604166666, "learning_rate": 0.0001, "loss": 5.9109, "loss/crossentropy": 2.312318801879883, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.40353919565677643, "step": 896 }, { "epoch": 0.01796, "grad_norm": 3.375, "grad_norm_var": 0.14010009765625, "learning_rate": 0.0001, "loss": 6.071, "loss/crossentropy": 2.28923499584198, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3770768642425537, "step": 898 }, { "epoch": 0.018, "grad_norm": 3.546875, "grad_norm_var": 0.1447662353515625, "learning_rate": 0.0001, "loss": 6.0275, "loss/crossentropy": 2.2060720920562744, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.35431359708309174, "step": 900 }, { "epoch": 0.01804, "grad_norm": 3.296875, "grad_norm_var": 0.17552083333333332, "learning_rate": 0.0001, "loss": 5.4052, "loss/crossentropy": 2.0325206518173218, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3037416934967041, "step": 902 }, { "epoch": 0.01808, "grad_norm": 3.546875, "grad_norm_var": 0.17635091145833334, "learning_rate": 0.0001, "loss": 5.9148, "loss/crossentropy": 2.1943042278289795, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3509814143180847, "step": 904 }, { "epoch": 0.01812, "grad_norm": 3.4375, "grad_norm_var": 0.1696197509765625, "learning_rate": 0.0001, "loss": 5.563, "loss/crossentropy": 1.9589214324951172, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.31621459126472473, "step": 906 }, { "epoch": 0.01816, "grad_norm": 3.375, "grad_norm_var": 0.12841389973958334, "learning_rate": 0.0001, "loss": 5.6511, "loss/crossentropy": 2.329489588737488, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3837278485298157, "step": 908 }, { "epoch": 0.0182, "grad_norm": 3.5625, "grad_norm_var": 0.09648030598958333, "learning_rate": 0.0001, "loss": 5.8082, "loss/crossentropy": 2.1757726669311523, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3659580200910568, "step": 910 }, { "epoch": 0.01824, "grad_norm": 3.75, "grad_norm_var": 0.08772786458333333, "learning_rate": 0.0001, "loss": 5.7372, "loss/crossentropy": 2.1498661041259766, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3474857211112976, "step": 912 }, { "epoch": 0.01828, "grad_norm": 15.8125, "grad_norm_var": 9.504325358072917, "learning_rate": 0.0001, "loss": 5.9297, "loss/crossentropy": 2.4722740650177, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.505307987332344, "step": 914 }, { "epoch": 0.01832, "grad_norm": 9.0, "grad_norm_var": 10.75227762858073, "learning_rate": 0.0001, "loss": 5.6296, "loss/crossentropy": 1.855428695678711, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.31347331404685974, "step": 916 }, { "epoch": 0.01836, "grad_norm": 3.75, "grad_norm_var": 10.50523173014323, "learning_rate": 0.0001, "loss": 5.9769, "loss/crossentropy": 2.326256275177002, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3996751010417938, "step": 918 }, { "epoch": 0.0184, "grad_norm": 3.546875, "grad_norm_var": 10.518257649739583, "learning_rate": 0.0001, "loss": 5.8444, "loss/crossentropy": 2.3712844848632812, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.37993232905864716, "step": 920 }, { "epoch": 0.01844, "grad_norm": 3.5, "grad_norm_var": 10.657861328125, "learning_rate": 0.0001, "loss": 5.5577, "loss/crossentropy": 2.0161430835723877, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3590858578681946, "step": 922 }, { "epoch": 0.01848, "grad_norm": 5.40625, "grad_norm_var": 10.520926920572917, "learning_rate": 0.0001, "loss": 5.6675, "loss/crossentropy": 2.2401121258735657, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.33380126953125, "step": 924 }, { "epoch": 0.01852, "grad_norm": 3.328125, "grad_norm_var": 10.598356119791667, "learning_rate": 0.0001, "loss": 5.9331, "loss/crossentropy": 2.2354423999786377, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.34832654893398285, "step": 926 }, { "epoch": 0.01856, "grad_norm": 3.46875, "grad_norm_var": 10.647850545247396, "learning_rate": 0.0001, "loss": 5.5212, "loss/crossentropy": 2.158566176891327, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3664778769016266, "step": 928 }, { "epoch": 0.0186, "grad_norm": 3.703125, "grad_norm_var": 2.1230377197265624, "learning_rate": 0.0001, "loss": 5.9146, "loss/crossentropy": 2.270231008529663, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.35586032271385193, "step": 930 }, { "epoch": 0.01864, "grad_norm": 3.609375, "grad_norm_var": 0.3744303385416667, "learning_rate": 0.0001, "loss": 6.0013, "loss/crossentropy": 2.233540892601013, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3789799362421036, "step": 932 }, { "epoch": 0.01868, "grad_norm": 3.65625, "grad_norm_var": 0.27898763020833334, "learning_rate": 0.0001, "loss": 5.5019, "loss/crossentropy": 1.9381731152534485, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.28776855766773224, "step": 934 }, { "epoch": 0.01872, "grad_norm": 3.34375, "grad_norm_var": 0.28227437337239586, "learning_rate": 0.0001, "loss": 5.6759, "loss/crossentropy": 2.4225244522094727, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.355392187833786, "step": 936 }, { "epoch": 0.01876, "grad_norm": 3.15625, "grad_norm_var": 0.28084208170572916, "learning_rate": 0.0001, "loss": 5.911, "loss/crossentropy": 2.58090603351593, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3640855699777603, "step": 938 }, { "epoch": 0.0188, "grad_norm": 3.5625, "grad_norm_var": 0.03673502604166667, "learning_rate": 0.0001, "loss": 6.0623, "loss/crossentropy": 2.436452269554138, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3797999918460846, "step": 940 }, { "epoch": 0.01884, "grad_norm": 3.3125, "grad_norm_var": 0.03680013020833333, "learning_rate": 0.0001, "loss": 5.7428, "loss/crossentropy": 2.0378769636154175, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3278265744447708, "step": 942 }, { "epoch": 0.01888, "grad_norm": 3.546875, "grad_norm_var": 0.03860270182291667, "learning_rate": 0.0001, "loss": 5.6211, "loss/crossentropy": 2.1212962865829468, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.34586282074451447, "step": 944 }, { "epoch": 0.01892, "grad_norm": 3.515625, "grad_norm_var": 0.040087890625, "learning_rate": 0.0001, "loss": 5.6695, "loss/crossentropy": 2.1884353160858154, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.40608666837215424, "step": 946 }, { "epoch": 0.01896, "grad_norm": 3.671875, "grad_norm_var": 0.046858723958333334, "learning_rate": 0.0001, "loss": 5.6684, "loss/crossentropy": 2.2093260288238525, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3519093841314316, "step": 948 }, { "epoch": 0.019, "grad_norm": 3.328125, "grad_norm_var": 0.03243815104166667, "learning_rate": 0.0001, "loss": 6.0842, "loss/crossentropy": 2.4246588945388794, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.37247334420681, "step": 950 }, { "epoch": 0.01904, "grad_norm": 3.671875, "grad_norm_var": 0.031022135416666666, "learning_rate": 0.0001, "loss": 5.6116, "loss/crossentropy": 1.932490050792694, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.32985979318618774, "step": 952 }, { "epoch": 0.01908, "grad_norm": 3.90625, "grad_norm_var": 0.7084920247395833, "learning_rate": 0.0001, "loss": 5.7393, "loss/crossentropy": 2.4439035654067993, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.38164034485816956, "step": 954 }, { "epoch": 0.01912, "grad_norm": 3.296875, "grad_norm_var": 0.72437744140625, "learning_rate": 0.0001, "loss": 5.6255, "loss/crossentropy": 1.8876591920852661, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3267661929130554, "step": 956 }, { "epoch": 0.01916, "grad_norm": 3.578125, "grad_norm_var": 0.6990193684895833, "learning_rate": 0.0001, "loss": 5.7367, "loss/crossentropy": 2.284990072250366, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.34810060262680054, "step": 958 }, { "epoch": 0.0192, "grad_norm": 3.53125, "grad_norm_var": 0.6961008707682291, "learning_rate": 0.0001, "loss": 5.888, "loss/crossentropy": 2.333263397216797, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.42767176032066345, "step": 960 }, { "epoch": 0.01924, "grad_norm": 3.296875, "grad_norm_var": 0.7323527018229167, "learning_rate": 0.0001, "loss": 5.6021, "loss/crossentropy": 2.2526148557662964, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3284706473350525, "step": 962 }, { "epoch": 0.01928, "grad_norm": 4.25, "grad_norm_var": 0.7586252848307292, "learning_rate": 0.0001, "loss": 5.5479, "loss/crossentropy": 2.1782984137535095, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.35957905650138855, "step": 964 }, { "epoch": 0.01932, "grad_norm": 4.28125, "grad_norm_var": 0.75205078125, "learning_rate": 0.0001, "loss": 6.2783, "loss/crossentropy": 2.292098045349121, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.35862940549850464, "step": 966 }, { "epoch": 0.01936, "grad_norm": 3.546875, "grad_norm_var": 0.7503214518229167, "learning_rate": 0.0001, "loss": 5.9252, "loss/crossentropy": 2.102781653404236, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3527261018753052, "step": 968 }, { "epoch": 0.0194, "grad_norm": 3.453125, "grad_norm_var": 0.11111653645833333, "learning_rate": 0.0001, "loss": 5.8891, "loss/crossentropy": 2.223380208015442, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3608042299747467, "step": 970 }, { "epoch": 0.01944, "grad_norm": 3.4375, "grad_norm_var": 0.10501200358072917, "learning_rate": 0.0001, "loss": 5.3348, "loss/crossentropy": 2.0684096813201904, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.31996411085128784, "step": 972 }, { "epoch": 0.01948, "grad_norm": 4.15625, "grad_norm_var": 0.126708984375, "learning_rate": 0.0001, "loss": 5.6642, "loss/crossentropy": 2.2011090517044067, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.34404022991657257, "step": 974 }, { "epoch": 0.01952, "grad_norm": 3.140625, "grad_norm_var": 0.14937235514322916, "learning_rate": 0.0001, "loss": 5.5033, "loss/crossentropy": 2.027641534805298, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.31718479096889496, "step": 976 }, { "epoch": 0.01956, "grad_norm": 3.484375, "grad_norm_var": 0.13909403483072916, "learning_rate": 0.0001, "loss": 5.7294, "loss/crossentropy": 2.311842203140259, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3904002010822296, "step": 978 }, { "epoch": 0.0196, "grad_norm": 3.859375, "grad_norm_var": 0.11236572265625, "learning_rate": 0.0001, "loss": 5.4402, "loss/crossentropy": 2.3605271577835083, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.38789358735084534, "step": 980 }, { "epoch": 0.01964, "grad_norm": 3.765625, "grad_norm_var": 0.095166015625, "learning_rate": 0.0001, "loss": 6.1094, "loss/crossentropy": 2.1687097549438477, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3534909188747406, "step": 982 }, { "epoch": 0.01968, "grad_norm": 3.6875, "grad_norm_var": 0.0962554931640625, "learning_rate": 0.0001, "loss": 5.6656, "loss/crossentropy": 2.194393038749695, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.34674669802188873, "step": 984 }, { "epoch": 0.01972, "grad_norm": 3.359375, "grad_norm_var": 0.09602762858072916, "learning_rate": 0.0001, "loss": 5.5513, "loss/crossentropy": 2.1355903148651123, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3180558532476425, "step": 986 }, { "epoch": 0.01976, "grad_norm": 3.390625, "grad_norm_var": 0.08559468587239584, "learning_rate": 0.0001, "loss": 5.9431, "loss/crossentropy": 2.2688111066818237, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.365144744515419, "step": 988 }, { "epoch": 0.0198, "grad_norm": 3.65625, "grad_norm_var": 0.06303609212239583, "learning_rate": 0.0001, "loss": 5.5117, "loss/crossentropy": 2.423216700553894, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.34281550347805023, "step": 990 }, { "epoch": 0.01984, "grad_norm": 3.375, "grad_norm_var": 0.0419586181640625, "learning_rate": 0.0001, "loss": 5.4698, "loss/crossentropy": 1.9360128045082092, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3343205749988556, "step": 992 }, { "epoch": 0.01988, "grad_norm": 3.421875, "grad_norm_var": 0.04468994140625, "learning_rate": 0.0001, "loss": 5.8128, "loss/crossentropy": 2.181576132774353, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.35567884147167206, "step": 994 }, { "epoch": 0.01992, "grad_norm": 3.578125, "grad_norm_var": 0.037398274739583334, "learning_rate": 0.0001, "loss": 5.9295, "loss/crossentropy": 2.166663408279419, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.34105008840560913, "step": 996 }, { "epoch": 0.01996, "grad_norm": 3.203125, "grad_norm_var": 0.027408854166666666, "learning_rate": 0.0001, "loss": 5.5579, "loss/crossentropy": 2.285332202911377, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.35812389850616455, "step": 998 }, { "epoch": 0.02, "grad_norm": 4.0, "grad_norm_var": 35.396484375, "learning_rate": 0.0001, "loss": 6.3986, "loss/crossentropy": 2.088365077972412, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3717931807041168, "step": 1000 }, { "epoch": 0.02004, "grad_norm": 4.25, "grad_norm_var": 35.223714192708336, "learning_rate": 0.0001, "loss": 6.1327, "loss/crossentropy": 2.4051828384399414, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.420933797955513, "step": 1002 }, { "epoch": 0.02008, "grad_norm": 3.265625, "grad_norm_var": 35.27108968098958, "learning_rate": 0.0001, "loss": 5.6789, "loss/crossentropy": 2.3092572689056396, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.37571050226688385, "step": 1004 }, { "epoch": 0.02012, "grad_norm": 3.640625, "grad_norm_var": 35.29562072753906, "learning_rate": 0.0001, "loss": 5.6972, "loss/crossentropy": 2.147248387336731, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.29974566400051117, "step": 1006 }, { "epoch": 0.02016, "grad_norm": 3.984375, "grad_norm_var": 35.141299438476565, "learning_rate": 0.0001, "loss": 5.988, "loss/crossentropy": 2.3385868668556213, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.4254964739084244, "step": 1008 }, { "epoch": 0.0202, "grad_norm": 3.625, "grad_norm_var": 35.15125223795573, "learning_rate": 0.0001, "loss": 5.7553, "loss/crossentropy": 2.142681658267975, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3764440715312958, "step": 1010 }, { "epoch": 0.02024, "grad_norm": 3.34375, "grad_norm_var": 35.18043619791667, "learning_rate": 0.0001, "loss": 5.5947, "loss/crossentropy": 2.241790771484375, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3385400176048279, "step": 1012 }, { "epoch": 0.02028, "grad_norm": 3.859375, "grad_norm_var": 34.836360677083334, "learning_rate": 0.0001, "loss": 6.3228, "loss/crossentropy": 2.1563867330551147, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3954617381095886, "step": 1014 }, { "epoch": 0.02032, "grad_norm": 3.484375, "grad_norm_var": 0.09921468098958333, "learning_rate": 0.0001, "loss": 5.4613, "loss/crossentropy": 1.9462800025939941, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.32294341921806335, "step": 1016 }, { "epoch": 0.02036, "grad_norm": 3.53125, "grad_norm_var": 0.08026936848958334, "learning_rate": 0.0001, "loss": 5.4993, "loss/crossentropy": 1.83676278591156, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3142092078924179, "step": 1018 }, { "epoch": 0.0204, "grad_norm": 3.84375, "grad_norm_var": 0.09038798014322917, "learning_rate": 0.0001, "loss": 5.8174, "loss/crossentropy": 1.951962649822235, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3889008015394211, "step": 1020 }, { "epoch": 0.02044, "grad_norm": 3.578125, "grad_norm_var": 0.09976806640625, "learning_rate": 0.0001, "loss": 5.7173, "loss/crossentropy": 2.299771785736084, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3214885741472244, "step": 1022 }, { "epoch": 0.02048, "grad_norm": 3.1875, "grad_norm_var": 0.09135640462239583, "learning_rate": 0.0001, "loss": 5.2291, "loss/crossentropy": 1.9117569327354431, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.32827669382095337, "step": 1024 }, { "epoch": 0.02052, "grad_norm": 3.1875, "grad_norm_var": 0.096533203125, "learning_rate": 0.0001, "loss": 5.7974, "loss/crossentropy": 2.484488010406494, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.34361426532268524, "step": 1026 }, { "epoch": 0.02056, "grad_norm": 3.59375, "grad_norm_var": 0.09973551432291666, "learning_rate": 0.0001, "loss": 5.7044, "loss/crossentropy": 2.3155311346054077, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3734404444694519, "step": 1028 }, { "epoch": 0.0206, "grad_norm": 3.125, "grad_norm_var": 0.04983317057291667, "learning_rate": 0.0001, "loss": 5.4202, "loss/crossentropy": 2.081188380718231, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3250262886285782, "step": 1030 }, { "epoch": 0.02064, "grad_norm": 3.09375, "grad_norm_var": 0.0574127197265625, "learning_rate": 0.0001, "loss": 5.5885, "loss/crossentropy": 2.044768512248993, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.34609031677246094, "step": 1032 }, { "epoch": 0.02068, "grad_norm": 4.0, "grad_norm_var": 0.0875640869140625, "learning_rate": 0.0001, "loss": 6.078, "loss/crossentropy": 2.0666560530662537, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.365878626704216, "step": 1034 }, { "epoch": 0.02072, "grad_norm": 3.5625, "grad_norm_var": 0.08336588541666666, "learning_rate": 0.0001, "loss": 5.9891, "loss/crossentropy": 2.2933902740478516, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3754771202802658, "step": 1036 }, { "epoch": 0.02076, "grad_norm": 3.28125, "grad_norm_var": 0.08640848795572917, "learning_rate": 0.0001, "loss": 5.8105, "loss/crossentropy": 2.28829288482666, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3676798492670059, "step": 1038 }, { "epoch": 0.0208, "grad_norm": 3.5, "grad_norm_var": 0.08378499348958333, "learning_rate": 0.0001, "loss": 5.9305, "loss/crossentropy": 2.5891193151474, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.40094244480133057, "step": 1040 }, { "epoch": 0.02084, "grad_norm": 3.234375, "grad_norm_var": 0.08056233723958334, "learning_rate": 0.0001, "loss": 5.8579, "loss/crossentropy": 2.238967180252075, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3226759731769562, "step": 1042 }, { "epoch": 0.02088, "grad_norm": 3.296875, "grad_norm_var": 0.07796223958333333, "learning_rate": 0.0001, "loss": 5.2959, "loss/crossentropy": 2.0116711258888245, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3511478453874588, "step": 1044 }, { "epoch": 0.02092, "grad_norm": 3.515625, "grad_norm_var": 0.07344462076822916, "learning_rate": 0.0001, "loss": 5.4885, "loss/crossentropy": 2.4924051761627197, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3375450670719147, "step": 1046 }, { "epoch": 0.02096, "grad_norm": 3.234375, "grad_norm_var": 0.059235636393229166, "learning_rate": 0.0001, "loss": 5.6838, "loss/crossentropy": 2.138728439807892, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.34410610795021057, "step": 1048 }, { "epoch": 0.021, "grad_norm": 3.515625, "grad_norm_var": 0.046122233072916664, "learning_rate": 0.0001, "loss": 5.7371, "loss/crossentropy": 2.3748635053634644, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3580169975757599, "step": 1050 }, { "epoch": 0.02104, "grad_norm": 3.125, "grad_norm_var": 0.044310506184895834, "learning_rate": 0.0001, "loss": 5.427, "loss/crossentropy": 2.061552882194519, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.31069953739643097, "step": 1052 }, { "epoch": 0.02108, "grad_norm": 3.515625, "grad_norm_var": 0.03769124348958333, "learning_rate": 0.0001, "loss": 5.3469, "loss/crossentropy": 2.299555718898773, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.31592319905757904, "step": 1054 }, { "epoch": 0.02112, "grad_norm": 3.53125, "grad_norm_var": 0.04571940104166667, "learning_rate": 0.0001, "loss": 6.1254, "loss/crossentropy": 2.4866377115249634, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.37435297667980194, "step": 1056 }, { "epoch": 0.02116, "grad_norm": 3.203125, "grad_norm_var": 0.047684733072916666, "learning_rate": 0.0001, "loss": 5.5676, "loss/crossentropy": 1.8185054063796997, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2784232199192047, "step": 1058 }, { "epoch": 0.0212, "grad_norm": 3.3125, "grad_norm_var": 0.04840087890625, "learning_rate": 0.0001, "loss": 5.6184, "loss/crossentropy": 2.215538501739502, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3471361994743347, "step": 1060 }, { "epoch": 0.02124, "grad_norm": 3.078125, "grad_norm_var": 0.057616170247395834, "learning_rate": 0.0001, "loss": 5.7615, "loss/crossentropy": 2.6912894248962402, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.35911867022514343, "step": 1062 }, { "epoch": 0.02128, "grad_norm": 3.125, "grad_norm_var": 0.06670633951822917, "learning_rate": 0.0001, "loss": 5.3436, "loss/crossentropy": 1.9757090210914612, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.29868973791599274, "step": 1064 }, { "epoch": 0.02132, "grad_norm": 3.21875, "grad_norm_var": 0.052294921875, "learning_rate": 0.0001, "loss": 5.5334, "loss/crossentropy": 2.3396666049957275, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3561312407255173, "step": 1066 }, { "epoch": 0.02136, "grad_norm": 3.359375, "grad_norm_var": 0.0477935791015625, "learning_rate": 0.0001, "loss": 5.7564, "loss/crossentropy": 2.3498982191085815, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3842166066169739, "step": 1068 }, { "epoch": 0.0214, "grad_norm": 3.234375, "grad_norm_var": 0.03717041015625, "learning_rate": 0.0001, "loss": 5.8936, "loss/crossentropy": 2.037585139274597, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3429017663002014, "step": 1070 }, { "epoch": 0.02144, "grad_norm": 3.484375, "grad_norm_var": 0.0216949462890625, "learning_rate": 0.0001, "loss": 5.6938, "loss/crossentropy": 2.4804376363754272, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.35338981449604034, "step": 1072 }, { "epoch": 0.02148, "grad_norm": 3.375, "grad_norm_var": 0.027448527018229165, "learning_rate": 0.0001, "loss": 5.8146, "loss/crossentropy": 2.5210201740264893, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3769296407699585, "step": 1074 }, { "epoch": 0.02152, "grad_norm": 3.40625, "grad_norm_var": 0.03369852701822917, "learning_rate": 0.0001, "loss": 5.7854, "loss/crossentropy": 2.1258187294006348, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3353133201599121, "step": 1076 }, { "epoch": 0.02156, "grad_norm": 3.328125, "grad_norm_var": 0.026676432291666666, "learning_rate": 0.0001, "loss": 5.5436, "loss/crossentropy": 2.2107938528060913, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.34667155146598816, "step": 1078 }, { "epoch": 0.0216, "grad_norm": 3.171875, "grad_norm_var": 0.0194244384765625, "learning_rate": 0.0001, "loss": 5.7639, "loss/crossentropy": 1.9614633321762085, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2958581745624542, "step": 1080 }, { "epoch": 0.02164, "grad_norm": 3.515625, "grad_norm_var": 0.0194976806640625, "learning_rate": 0.0001, "loss": 5.8219, "loss/crossentropy": 2.1403249502182007, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3220343589782715, "step": 1082 }, { "epoch": 0.02168, "grad_norm": 3.28125, "grad_norm_var": 0.022493489583333335, "learning_rate": 0.0001, "loss": 5.6037, "loss/crossentropy": 1.6533048152923584, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2814648747444153, "step": 1084 }, { "epoch": 0.02172, "grad_norm": 3.59375, "grad_norm_var": 0.0232086181640625, "learning_rate": 0.0001, "loss": 5.7731, "loss/crossentropy": 2.69880211353302, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3835880607366562, "step": 1086 }, { "epoch": 0.02176, "grad_norm": 3.5, "grad_norm_var": 0.0240631103515625, "learning_rate": 0.0001, "loss": 5.8119, "loss/crossentropy": 2.214504837989807, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.31686101853847504, "step": 1088 }, { "epoch": 0.0218, "grad_norm": 3.765625, "grad_norm_var": 0.027164713541666666, "learning_rate": 0.0001, "loss": 6.0376, "loss/crossentropy": 2.2377456426620483, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3682183176279068, "step": 1090 }, { "epoch": 0.02184, "grad_norm": 3.359375, "grad_norm_var": 0.027799479166666665, "learning_rate": 0.0001, "loss": 5.7387, "loss/crossentropy": 2.0977545976638794, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.32313986122608185, "step": 1092 }, { "epoch": 0.02188, "grad_norm": 3.125, "grad_norm_var": 0.0371490478515625, "learning_rate": 0.0001, "loss": 5.6407, "loss/crossentropy": 2.1717870235443115, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3619600385427475, "step": 1094 }, { "epoch": 0.02192, "grad_norm": 3.546875, "grad_norm_var": 0.03674723307291667, "learning_rate": 0.0001, "loss": 5.554, "loss/crossentropy": 2.2805471420288086, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3328556418418884, "step": 1096 }, { "epoch": 0.02196, "grad_norm": 3.71875, "grad_norm_var": 0.04625244140625, "learning_rate": 0.0001, "loss": 5.8009, "loss/crossentropy": 2.034846782684326, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3610256612300873, "step": 1098 }, { "epoch": 0.022, "grad_norm": 3.359375, "grad_norm_var": 0.04612528483072917, "learning_rate": 0.0001, "loss": 5.6121, "loss/crossentropy": 2.0208348631858826, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3311958611011505, "step": 1100 }, { "epoch": 0.02204, "grad_norm": 3.59375, "grad_norm_var": 0.04737040201822917, "learning_rate": 0.0001, "loss": 5.7135, "loss/crossentropy": 2.1020554900169373, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3638540059328079, "step": 1102 }, { "epoch": 0.02208, "grad_norm": 3.46875, "grad_norm_var": 0.046019490559895834, "learning_rate": 0.0001, "loss": 5.5099, "loss/crossentropy": 2.28346848487854, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3434429168701172, "step": 1104 }, { "epoch": 0.02212, "grad_norm": 3.21875, "grad_norm_var": 0.1152740478515625, "learning_rate": 0.0001, "loss": 5.6866, "loss/crossentropy": 2.103623867034912, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.34000229835510254, "step": 1106 }, { "epoch": 0.02216, "grad_norm": 4.1875, "grad_norm_var": 0.14846089680989583, "learning_rate": 0.0001, "loss": 5.5196, "loss/crossentropy": 2.205894947052002, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3432965874671936, "step": 1108 }, { "epoch": 0.0222, "grad_norm": 3.109375, "grad_norm_var": 0.16467183430989582, "learning_rate": 0.0001, "loss": 5.6166, "loss/crossentropy": 2.395035982131958, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3834904432296753, "step": 1110 }, { "epoch": 0.02224, "grad_norm": 3.296875, "grad_norm_var": 0.16505533854166668, "learning_rate": 0.0001, "loss": 5.9042, "loss/crossentropy": 2.3755353689193726, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.33256760239601135, "step": 1112 }, { "epoch": 0.02228, "grad_norm": 3.28125, "grad_norm_var": 0.16402079264322916, "learning_rate": 0.0001, "loss": 6.0135, "loss/crossentropy": 2.6754449605941772, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3524845540523529, "step": 1114 }, { "epoch": 0.02232, "grad_norm": 3.515625, "grad_norm_var": 0.15730692545572916, "learning_rate": 0.0001, "loss": 5.6448, "loss/crossentropy": 2.2398552894592285, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.343609020113945, "step": 1116 }, { "epoch": 0.02236, "grad_norm": 3.3125, "grad_norm_var": 0.16770833333333332, "learning_rate": 0.0001, "loss": 5.3075, "loss/crossentropy": 2.399322271347046, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.34514427185058594, "step": 1118 }, { "epoch": 0.0224, "grad_norm": 3.3125, "grad_norm_var": 0.17229410807291667, "learning_rate": 0.0001, "loss": 5.8886, "loss/crossentropy": 2.4002050161361694, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.35784730315208435, "step": 1120 }, { "epoch": 0.02244, "grad_norm": 3.21875, "grad_norm_var": 0.10064188639322917, "learning_rate": 0.0001, "loss": 5.6239, "loss/crossentropy": 2.293683171272278, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.37052902579307556, "step": 1122 }, { "epoch": 0.02248, "grad_norm": 3.234375, "grad_norm_var": 0.0532379150390625, "learning_rate": 0.0001, "loss": 5.8244, "loss/crossentropy": 2.2177391052246094, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3482564836740494, "step": 1124 }, { "epoch": 0.02252, "grad_norm": 4.1875, "grad_norm_var": 0.0637603759765625, "learning_rate": 0.0001, "loss": 5.8959, "loss/crossentropy": 2.288211703300476, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3401540517807007, "step": 1126 }, { "epoch": 0.02256, "grad_norm": 3.1875, "grad_norm_var": 0.0933258056640625, "learning_rate": 0.0001, "loss": 5.5054, "loss/crossentropy": 2.1786144971847534, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.331302285194397, "step": 1128 }, { "epoch": 0.0226, "grad_norm": 3.5, "grad_norm_var": 0.09163004557291667, "learning_rate": 0.0001, "loss": 5.7604, "loss/crossentropy": 2.0390175580978394, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.32953669130802155, "step": 1130 }, { "epoch": 0.02264, "grad_norm": 3.5, "grad_norm_var": 0.0921295166015625, "learning_rate": 0.0001, "loss": 5.6952, "loss/crossentropy": 2.188543677330017, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3585694134235382, "step": 1132 }, { "epoch": 0.02268, "grad_norm": 3.296875, "grad_norm_var": 0.08284098307291667, "learning_rate": 0.0001, "loss": 5.732, "loss/crossentropy": 2.0731694102287292, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3121813088655472, "step": 1134 }, { "epoch": 0.02272, "grad_norm": 3.390625, "grad_norm_var": 0.0794097900390625, "learning_rate": 0.0001, "loss": 5.6306, "loss/crossentropy": 2.144552707672119, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3303966820240021, "step": 1136 }, { "epoch": 0.02276, "grad_norm": 3.890625, "grad_norm_var": 0.09900614420572916, "learning_rate": 0.0001, "loss": 5.3794, "loss/crossentropy": 2.1617428064346313, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3420899361371994, "step": 1138 }, { "epoch": 0.0228, "grad_norm": 3.109375, "grad_norm_var": 0.12813212076822916, "learning_rate": 0.0001, "loss": 5.2834, "loss/crossentropy": 1.8098865747451782, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.28376901149749756, "step": 1140 }, { "epoch": 0.02284, "grad_norm": 2.984375, "grad_norm_var": 0.09795633951822917, "learning_rate": 0.0001, "loss": 5.3911, "loss/crossentropy": 2.133797824382782, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3587050139904022, "step": 1142 }, { "epoch": 0.02288, "grad_norm": 4.96875, "grad_norm_var": 0.24221903483072918, "learning_rate": 0.0001, "loss": 5.8787, "loss/crossentropy": 2.378090739250183, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.4244185537099838, "step": 1144 }, { "epoch": 0.02292, "grad_norm": 3.03125, "grad_norm_var": 0.2736887613932292, "learning_rate": 0.0001, "loss": 5.6692, "loss/crossentropy": 2.4442414045333862, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3569464683532715, "step": 1146 }, { "epoch": 0.02296, "grad_norm": 3.671875, "grad_norm_var": 0.29189453125, "learning_rate": 0.0001, "loss": 5.5468, "loss/crossentropy": 2.6446973085403442, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.36859095096588135, "step": 1148 }, { "epoch": 0.023, "grad_norm": 4.4375, "grad_norm_var": 0.3506988525390625, "learning_rate": 0.0001, "loss": 5.9727, "loss/crossentropy": 2.4100207090377808, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.4082309305667877, "step": 1150 }, { "epoch": 0.02304, "grad_norm": 3.25, "grad_norm_var": 0.3552317301432292, "learning_rate": 0.0001, "loss": 5.3537, "loss/crossentropy": 2.1472485661506653, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3299275189638138, "step": 1152 }, { "epoch": 0.02308, "grad_norm": 3.21875, "grad_norm_var": 0.3376373291015625, "learning_rate": 0.0001, "loss": 5.4393, "loss/crossentropy": 2.1891872882843018, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.356732040643692, "step": 1154 }, { "epoch": 0.02312, "grad_norm": 3.71875, "grad_norm_var": 0.30201416015625, "learning_rate": 0.0001, "loss": 6.0049, "loss/crossentropy": 2.1432933807373047, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.360423281788826, "step": 1156 }, { "epoch": 0.02316, "grad_norm": 3.1875, "grad_norm_var": 0.2923248291015625, "learning_rate": 0.0001, "loss": 5.5541, "loss/crossentropy": 2.176819324493408, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.340317040681839, "step": 1158 }, { "epoch": 0.0232, "grad_norm": 2.96875, "grad_norm_var": 0.17407938639322917, "learning_rate": 0.0001, "loss": 5.2312, "loss/crossentropy": 2.325207471847534, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3309635668992996, "step": 1160 }, { "epoch": 0.02324, "grad_norm": 3.515625, "grad_norm_var": 0.14537353515625, "learning_rate": 0.0001, "loss": 5.7322, "loss/crossentropy": 2.2536743879318237, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.31422293186187744, "step": 1162 }, { "epoch": 0.02328, "grad_norm": 3.328125, "grad_norm_var": 0.12657877604166667, "learning_rate": 0.0001, "loss": 5.4354, "loss/crossentropy": 2.180476427078247, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2956361174583435, "step": 1164 }, { "epoch": 0.02332, "grad_norm": 3.15625, "grad_norm_var": 0.0527008056640625, "learning_rate": 0.0001, "loss": 5.6067, "loss/crossentropy": 1.995088815689087, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2936599552631378, "step": 1166 }, { "epoch": 0.02336, "grad_norm": 3.1875, "grad_norm_var": 0.05378316243489583, "learning_rate": 0.0001, "loss": 5.7304, "loss/crossentropy": 2.2555994987487793, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3630402684211731, "step": 1168 }, { "epoch": 0.0234, "grad_norm": 3.296875, "grad_norm_var": 0.05856119791666667, "learning_rate": 0.0001, "loss": 5.1613, "loss/crossentropy": 2.0441415905952454, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2960120141506195, "step": 1170 }, { "epoch": 0.02344, "grad_norm": 3.25, "grad_norm_var": 0.0204498291015625, "learning_rate": 0.0001, "loss": 5.2238, "loss/crossentropy": 1.8090497255325317, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.29750876128673553, "step": 1172 }, { "epoch": 0.02348, "grad_norm": 3.234375, "grad_norm_var": 0.0281158447265625, "learning_rate": 0.0001, "loss": 5.2506, "loss/crossentropy": 2.2780312299728394, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3376633822917938, "step": 1174 }, { "epoch": 0.02352, "grad_norm": 3.4375, "grad_norm_var": 0.02506103515625, "learning_rate": 0.0001, "loss": 5.53, "loss/crossentropy": 1.8047232627868652, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.29132279753685, "step": 1176 }, { "epoch": 0.02356, "grad_norm": 3.265625, "grad_norm_var": 0.020417277018229166, "learning_rate": 0.0001, "loss": 5.4149, "loss/crossentropy": 2.203469753265381, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3124798536300659, "step": 1178 }, { "epoch": 0.0236, "grad_norm": 3.34375, "grad_norm_var": 0.019775390625, "learning_rate": 0.0001, "loss": 5.4481, "loss/crossentropy": 2.078102231025696, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.331977054476738, "step": 1180 }, { "epoch": 0.02364, "grad_norm": 3.765625, "grad_norm_var": 0.0368316650390625, "learning_rate": 0.0001, "loss": 5.93, "loss/crossentropy": 2.2701854705810547, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3202142268419266, "step": 1182 }, { "epoch": 0.02368, "grad_norm": 3.671875, "grad_norm_var": 0.047240193684895834, "learning_rate": 0.0001, "loss": 5.6414, "loss/crossentropy": 1.930423617362976, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3192301094532013, "step": 1184 }, { "epoch": 0.02372, "grad_norm": 3.546875, "grad_norm_var": 0.06542867024739583, "learning_rate": 0.0001, "loss": 6.0688, "loss/crossentropy": 2.291581869125366, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3759836256504059, "step": 1186 }, { "epoch": 0.02376, "grad_norm": 3.375, "grad_norm_var": 0.06529541015625, "learning_rate": 0.0001, "loss": 5.6689, "loss/crossentropy": 1.9887789487838745, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.31285202503204346, "step": 1188 }, { "epoch": 0.0238, "grad_norm": 3.109375, "grad_norm_var": 0.054671223958333334, "learning_rate": 0.0001, "loss": 5.7457, "loss/crossentropy": 2.290129065513611, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3461494445800781, "step": 1190 }, { "epoch": 0.02384, "grad_norm": 3.6875, "grad_norm_var": 0.0604888916015625, "learning_rate": 0.0001, "loss": 6.288, "loss/crossentropy": 2.3252567052841187, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3817393332719803, "step": 1192 }, { "epoch": 0.02388, "grad_norm": 3.21875, "grad_norm_var": 0.06230367024739583, "learning_rate": 0.0001, "loss": 5.4498, "loss/crossentropy": 1.736217737197876, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.28928153216838837, "step": 1194 }, { "epoch": 0.02392, "grad_norm": 3.09375, "grad_norm_var": 0.06886393229166667, "learning_rate": 0.0001, "loss": 5.6234, "loss/crossentropy": 2.112093210220337, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3196643739938736, "step": 1196 }, { "epoch": 0.02396, "grad_norm": 3.15625, "grad_norm_var": 0.07258707682291667, "learning_rate": 0.0001, "loss": 5.3552, "loss/crossentropy": 2.3334370851516724, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3539857119321823, "step": 1198 }, { "epoch": 0.024, "grad_norm": 3.484375, "grad_norm_var": 0.09695536295572917, "learning_rate": 0.0001, "loss": 5.3293, "loss/crossentropy": 1.7393567562103271, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3211805671453476, "step": 1200 }, { "epoch": 0.02404, "grad_norm": 3.4375, "grad_norm_var": 0.07935791015625, "learning_rate": 0.0001, "loss": 5.6292, "loss/crossentropy": 2.314788579940796, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.37599092721939087, "step": 1202 }, { "epoch": 0.02408, "grad_norm": 3.1875, "grad_norm_var": 0.076318359375, "learning_rate": 0.0001, "loss": 5.435, "loss/crossentropy": 2.2820088863372803, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.33955833315849304, "step": 1204 }, { "epoch": 0.02412, "grad_norm": 3.46875, "grad_norm_var": 0.07183837890625, "learning_rate": 0.0001, "loss": 5.87, "loss/crossentropy": 2.012476146221161, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3048281967639923, "step": 1206 }, { "epoch": 0.02416, "grad_norm": 3.1875, "grad_norm_var": 0.07356669108072916, "learning_rate": 0.0001, "loss": 5.6751, "loss/crossentropy": 2.315858483314514, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.32628118991851807, "step": 1208 }, { "epoch": 0.0242, "grad_norm": 3.359375, "grad_norm_var": 0.14773763020833333, "learning_rate": 0.0001, "loss": 5.6813, "loss/crossentropy": 2.3260581493377686, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.352965384721756, "step": 1210 }, { "epoch": 0.02424, "grad_norm": 4.0625, "grad_norm_var": 0.1748687744140625, "learning_rate": 0.0001, "loss": 5.689, "loss/crossentropy": 2.199007749557495, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3089260905981064, "step": 1212 }, { "epoch": 0.02428, "grad_norm": 3.703125, "grad_norm_var": 1.4942047119140625, "learning_rate": 0.0001, "loss": 5.798, "loss/crossentropy": 2.30281138420105, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3629491478204727, "step": 1214 }, { "epoch": 0.02432, "grad_norm": 3.1875, "grad_norm_var": 1.5125640869140624, "learning_rate": 0.0001, "loss": 5.4927, "loss/crossentropy": 2.238295316696167, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3436104357242584, "step": 1216 }, { "epoch": 0.02436, "grad_norm": 3.9375, "grad_norm_var": 1.501488240559896, "learning_rate": 0.0001, "loss": 5.5763, "loss/crossentropy": 2.52456271648407, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.40382860600948334, "step": 1218 }, { "epoch": 0.0244, "grad_norm": 3.46875, "grad_norm_var": 1.47197265625, "learning_rate": 0.0001, "loss": 5.8796, "loss/crossentropy": 2.4516665935516357, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.378818541765213, "step": 1220 }, { "epoch": 0.02444, "grad_norm": 3.046875, "grad_norm_var": 1.485480753580729, "learning_rate": 0.0001, "loss": 5.5438, "loss/crossentropy": 2.593857169151306, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3743816912174225, "step": 1222 }, { "epoch": 0.02448, "grad_norm": 3.375, "grad_norm_var": 1.4396799723307292, "learning_rate": 0.0001, "loss": 5.2807, "loss/crossentropy": 1.873874843120575, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2743247449398041, "step": 1224 }, { "epoch": 0.02452, "grad_norm": 3.265625, "grad_norm_var": 1.4642862955729166, "learning_rate": 0.0001, "loss": 5.6455, "loss/crossentropy": 2.1173152923583984, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.350399985909462, "step": 1226 }, { "epoch": 0.02456, "grad_norm": 3.5, "grad_norm_var": 1.4721018473307292, "learning_rate": 0.0001, "loss": 5.7318, "loss/crossentropy": 2.3650271892547607, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3241463750600815, "step": 1228 }, { "epoch": 0.0246, "grad_norm": 3.1875, "grad_norm_var": 0.08748270670572916, "learning_rate": 0.0001, "loss": 5.5229, "loss/crossentropy": 2.180622935295105, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.31293927133083344, "step": 1230 }, { "epoch": 0.02464, "grad_norm": 3.546875, "grad_norm_var": 0.08329976399739583, "learning_rate": 0.0001, "loss": 5.7443, "loss/crossentropy": 2.230265259742737, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3405953049659729, "step": 1232 }, { "epoch": 0.02468, "grad_norm": 3.6875, "grad_norm_var": 0.07164713541666666, "learning_rate": 0.0001, "loss": 5.5518, "loss/crossentropy": 2.050285518169403, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.32737791538238525, "step": 1234 }, { "epoch": 0.02472, "grad_norm": 3.140625, "grad_norm_var": 0.07604166666666666, "learning_rate": 0.0001, "loss": 5.4412, "loss/crossentropy": 2.2357693910598755, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.34366659820079803, "step": 1236 }, { "epoch": 0.02476, "grad_norm": 3.09375, "grad_norm_var": 0.09081624348958334, "learning_rate": 0.0001, "loss": 5.6066, "loss/crossentropy": 2.308778762817383, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.33825138211250305, "step": 1238 }, { "epoch": 0.0248, "grad_norm": 3.390625, "grad_norm_var": 0.0979888916015625, "learning_rate": 0.0001, "loss": 5.419, "loss/crossentropy": 2.016503393650055, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.30135589838027954, "step": 1240 }, { "epoch": 0.02484, "grad_norm": 3.359375, "grad_norm_var": 0.0870025634765625, "learning_rate": 0.0001, "loss": 5.2494, "loss/crossentropy": 1.8450073599815369, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.34388674795627594, "step": 1242 }, { "epoch": 0.02488, "grad_norm": 3.515625, "grad_norm_var": 0.05263671875, "learning_rate": 0.0001, "loss": 5.7092, "loss/crossentropy": 2.486730694770813, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.35796378552913666, "step": 1244 }, { "epoch": 0.02492, "grad_norm": 2.859375, "grad_norm_var": 0.0688385009765625, "learning_rate": 0.0001, "loss": 5.2565, "loss/crossentropy": 2.010675370693207, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.31364670395851135, "step": 1246 }, { "epoch": 0.02496, "grad_norm": 3.15625, "grad_norm_var": 0.07418619791666667, "learning_rate": 0.0001, "loss": 5.366, "loss/crossentropy": 2.128747880458832, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3106560483574867, "step": 1248 }, { "epoch": 0.025, "grad_norm": 3.359375, "grad_norm_var": 0.06523335774739583, "learning_rate": 0.0001, "loss": 5.8005, "loss/crossentropy": 2.4563735723495483, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.33769866824150085, "step": 1250 }, { "epoch": 0.02504, "grad_norm": 3.15625, "grad_norm_var": 0.06453450520833333, "learning_rate": 0.0001, "loss": 5.372, "loss/crossentropy": 2.3785592317581177, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.36167748272418976, "step": 1252 }, { "epoch": 0.02508, "grad_norm": 3.21875, "grad_norm_var": 0.029069010416666666, "learning_rate": 0.0001, "loss": 5.5506, "loss/crossentropy": 2.2642308473587036, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3315223157405853, "step": 1254 }, { "epoch": 0.02512, "grad_norm": 3.46875, "grad_norm_var": 0.028416951497395832, "learning_rate": 0.0001, "loss": 5.4608, "loss/crossentropy": 2.2246991395950317, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.33083613216876984, "step": 1256 }, { "epoch": 0.02516, "grad_norm": 3.1875, "grad_norm_var": 0.0290435791015625, "learning_rate": 0.0001, "loss": 5.762, "loss/crossentropy": 2.129785656929016, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.32672610878944397, "step": 1258 }, { "epoch": 0.0252, "grad_norm": 3.125, "grad_norm_var": 0.0243804931640625, "learning_rate": 0.0001, "loss": 5.7297, "loss/crossentropy": 2.0835453271865845, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3504233658313751, "step": 1260 }, { "epoch": 0.02524, "grad_norm": 3.703125, "grad_norm_var": 0.024637858072916668, "learning_rate": 0.0001, "loss": 5.6707, "loss/crossentropy": 2.443893313407898, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3636191487312317, "step": 1262 }, { "epoch": 0.02528, "grad_norm": 3.21875, "grad_norm_var": 0.02197265625, "learning_rate": 0.0001, "loss": 5.6107, "loss/crossentropy": 2.367433190345764, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.35734108090400696, "step": 1264 }, { "epoch": 0.02532, "grad_norm": 3.1875, "grad_norm_var": 0.02662353515625, "learning_rate": 0.0001, "loss": 5.049, "loss/crossentropy": 1.8102024793624878, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.28676700592041016, "step": 1266 }, { "epoch": 0.02536, "grad_norm": 3.0, "grad_norm_var": 0.03738606770833333, "learning_rate": 0.0001, "loss": 5.3916, "loss/crossentropy": 2.54524827003479, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3380637466907501, "step": 1268 }, { "epoch": 0.0254, "grad_norm": 2.984375, "grad_norm_var": 0.040913899739583336, "learning_rate": 0.0001, "loss": 5.6667, "loss/crossentropy": 2.4164276123046875, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.36949749290943146, "step": 1270 }, { "epoch": 0.02544, "grad_norm": 3.515625, "grad_norm_var": 0.043745930989583334, "learning_rate": 0.0001, "loss": 5.4235, "loss/crossentropy": 2.4335602521896362, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3608807325363159, "step": 1272 }, { "epoch": 0.02548, "grad_norm": 3.1875, "grad_norm_var": 0.0422760009765625, "learning_rate": 0.0001, "loss": 5.5858, "loss/crossentropy": 2.2711308002471924, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3631722033023834, "step": 1274 }, { "epoch": 0.02552, "grad_norm": 3.140625, "grad_norm_var": 0.0430816650390625, "learning_rate": 0.0001, "loss": 5.181, "loss/crossentropy": 2.378043293952942, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.35628968477249146, "step": 1276 }, { "epoch": 0.02556, "grad_norm": 3.1875, "grad_norm_var": 0.02431640625, "learning_rate": 0.0001, "loss": 5.5721, "loss/crossentropy": 1.8950039148330688, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3105602264404297, "step": 1278 }, { "epoch": 0.0256, "grad_norm": 2.96875, "grad_norm_var": 0.026659138997395835, "learning_rate": 0.0001, "loss": 5.4649, "loss/crossentropy": 1.8309656977653503, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.27458132803440094, "step": 1280 }, { "epoch": 0.02564, "grad_norm": 3.40625, "grad_norm_var": 0.034195963541666666, "learning_rate": 0.0001, "loss": 5.993, "loss/crossentropy": 2.3949296474456787, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.37693680822849274, "step": 1282 }, { "epoch": 0.02568, "grad_norm": 3.140625, "grad_norm_var": 0.026725260416666667, "learning_rate": 0.0001, "loss": 5.6157, "loss/crossentropy": 2.497879147529602, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.36076007783412933, "step": 1284 }, { "epoch": 0.02572, "grad_norm": 2.953125, "grad_norm_var": 0.027339680989583334, "learning_rate": 0.0001, "loss": 5.354, "loss/crossentropy": 2.108432352542877, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3115152269601822, "step": 1286 }, { "epoch": 0.02576, "grad_norm": 3.171875, "grad_norm_var": 0.0201568603515625, "learning_rate": 0.0001, "loss": 5.5424, "loss/crossentropy": 2.079313635826111, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.31575673818588257, "step": 1288 }, { "epoch": 0.0258, "grad_norm": 3.15625, "grad_norm_var": 0.020099894205729166, "learning_rate": 0.0001, "loss": 5.2594, "loss/crossentropy": 2.3390332460403442, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3293873071670532, "step": 1290 }, { "epoch": 0.02584, "grad_norm": 3.15625, "grad_norm_var": 0.019612630208333332, "learning_rate": 0.0001, "loss": 5.2895, "loss/crossentropy": 2.180980920791626, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.30600421130657196, "step": 1292 }, { "epoch": 0.02588, "grad_norm": 2.9375, "grad_norm_var": 0.022858683268229166, "learning_rate": 0.0001, "loss": 5.2784, "loss/crossentropy": 2.0647836327552795, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3114248663187027, "step": 1294 }, { "epoch": 0.02592, "grad_norm": 2.84375, "grad_norm_var": 0.02958984375, "learning_rate": 0.0001, "loss": 5.5063, "loss/crossentropy": 1.931971788406372, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.32757866382598877, "step": 1296 }, { "epoch": 0.02596, "grad_norm": 3.09375, "grad_norm_var": 0.0202301025390625, "learning_rate": 0.0001, "loss": 5.6389, "loss/crossentropy": 2.1180718541145325, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3272206783294678, "step": 1298 }, { "epoch": 0.026, "grad_norm": 3.4375, "grad_norm_var": 0.024616495768229166, "learning_rate": 0.0001, "loss": 5.5069, "loss/crossentropy": 1.8535473346710205, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3100028932094574, "step": 1300 }, { "epoch": 0.02604, "grad_norm": 4.5, "grad_norm_var": 0.16031901041666666, "learning_rate": 0.0001, "loss": 5.574, "loss/crossentropy": 1.9625197052955627, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3074956685304642, "step": 1302 }, { "epoch": 0.02608, "grad_norm": 3.25, "grad_norm_var": 0.1673736572265625, "learning_rate": 0.0001, "loss": 5.3764, "loss/crossentropy": 2.248521149158478, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.33484284579753876, "step": 1304 }, { "epoch": 0.02612, "grad_norm": 2.921875, "grad_norm_var": 0.1750640869140625, "learning_rate": 0.0001, "loss": 5.8318, "loss/crossentropy": 2.6374051570892334, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3604440838098526, "step": 1306 }, { "epoch": 0.02616, "grad_norm": 4.0625, "grad_norm_var": 0.22333984375, "learning_rate": 0.0001, "loss": 5.5151, "loss/crossentropy": 2.3213003873825073, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3659953773021698, "step": 1308 }, { "epoch": 0.0262, "grad_norm": 3.734375, "grad_norm_var": 0.21988525390625, "learning_rate": 0.0001, "loss": 5.9334, "loss/crossentropy": 2.3527311086654663, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3638792932033539, "step": 1310 }, { "epoch": 0.02624, "grad_norm": 3.140625, "grad_norm_var": 0.20551656087239584, "learning_rate": 0.0001, "loss": 5.4442, "loss/crossentropy": 1.6319801807403564, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.29070258140563965, "step": 1312 }, { "epoch": 0.02628, "grad_norm": 3.125, "grad_norm_var": 0.215966796875, "learning_rate": 0.0001, "loss": 5.1864, "loss/crossentropy": 1.986265480518341, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.30515219271183014, "step": 1314 }, { "epoch": 0.02632, "grad_norm": 2.828125, "grad_norm_var": 0.23336181640625, "learning_rate": 0.0001, "loss": 5.5808, "loss/crossentropy": 2.237283766269684, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.32876546680927277, "step": 1316 }, { "epoch": 0.02636, "grad_norm": 3.859375, "grad_norm_var": 0.13362223307291668, "learning_rate": 0.0001, "loss": 5.4868, "loss/crossentropy": 2.215467691421509, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.34449851512908936, "step": 1318 }, { "epoch": 0.0264, "grad_norm": 3.03125, "grad_norm_var": 0.12567952473958333, "learning_rate": 0.0001, "loss": 5.7427, "loss/crossentropy": 2.264952063560486, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.387825608253479, "step": 1320 }, { "epoch": 0.02644, "grad_norm": 3.125, "grad_norm_var": 0.12009989420572917, "learning_rate": 0.0001, "loss": 5.5614, "loss/crossentropy": 2.0678945779800415, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3228907287120819, "step": 1322 }, { "epoch": 0.02648, "grad_norm": 4.40625, "grad_norm_var": 0.212841796875, "learning_rate": 0.0001, "loss": 5.6259, "loss/crossentropy": 2.1414765119552612, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.31144315004348755, "step": 1324 }, { "epoch": 0.02652, "grad_norm": 4.03125, "grad_norm_var": 0.23813374837239584, "learning_rate": 0.0001, "loss": 5.8487, "loss/crossentropy": 2.3898890018463135, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3721010088920593, "step": 1326 }, { "epoch": 0.02656, "grad_norm": 3.421875, "grad_norm_var": 0.23483784993489584, "learning_rate": 0.0001, "loss": 5.5417, "loss/crossentropy": 2.404030442237854, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.36486634612083435, "step": 1328 }, { "epoch": 0.0266, "grad_norm": 3.15625, "grad_norm_var": 0.22004801432291668, "learning_rate": 0.0001, "loss": 5.7535, "loss/crossentropy": 2.1436617970466614, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3091920465230942, "step": 1330 }, { "epoch": 0.02664, "grad_norm": 3.1875, "grad_norm_var": 0.2074615478515625, "learning_rate": 0.0001, "loss": 5.2832, "loss/crossentropy": 2.106055796146393, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3344803601503372, "step": 1332 }, { "epoch": 0.02668, "grad_norm": 3.640625, "grad_norm_var": 0.19583333333333333, "learning_rate": 0.0001, "loss": 6.0091, "loss/crossentropy": 2.465435266494751, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.34129244089126587, "step": 1334 }, { "epoch": 0.02672, "grad_norm": 3.21875, "grad_norm_var": 0.18728841145833333, "learning_rate": 0.0001, "loss": 5.5569, "loss/crossentropy": 1.9109253883361816, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.30202071368694305, "step": 1336 }, { "epoch": 0.02676, "grad_norm": 3.0, "grad_norm_var": 0.1890045166015625, "learning_rate": 0.0001, "loss": 5.7093, "loss/crossentropy": 2.267784833908081, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.32984335720539093, "step": 1338 }, { "epoch": 0.0268, "grad_norm": 3.0625, "grad_norm_var": 0.07266337076822917, "learning_rate": 0.0001, "loss": 5.4432, "loss/crossentropy": 2.6131194829940796, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3797858655452728, "step": 1340 }, { "epoch": 0.02684, "grad_norm": 3.171875, "grad_norm_var": 0.031891886393229166, "learning_rate": 0.0001, "loss": 5.7456, "loss/crossentropy": 2.214663505554199, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.340796560049057, "step": 1342 }, { "epoch": 0.02688, "grad_norm": 3.0, "grad_norm_var": 0.03245340983072917, "learning_rate": 0.0001, "loss": 5.4205, "loss/crossentropy": 2.0236783027648926, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3101559728384018, "step": 1344 }, { "epoch": 0.02692, "grad_norm": 3.078125, "grad_norm_var": 0.030565388997395835, "learning_rate": 0.0001, "loss": 5.2671, "loss/crossentropy": 2.3260135650634766, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.339004784822464, "step": 1346 }, { "epoch": 0.02696, "grad_norm": 2.96875, "grad_norm_var": 0.0317047119140625, "learning_rate": 0.0001, "loss": 5.6529, "loss/crossentropy": 2.177807927131653, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3171197772026062, "step": 1348 }, { "epoch": 0.027, "grad_norm": 3.375, "grad_norm_var": 0.0202056884765625, "learning_rate": 0.0001, "loss": 5.4682, "loss/crossentropy": 2.351730227470398, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.34749266505241394, "step": 1350 }, { "epoch": 0.02704, "grad_norm": 2.953125, "grad_norm_var": 0.017867024739583334, "learning_rate": 0.0001, "loss": 5.3692, "loss/crossentropy": 2.2959564924240112, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3270048499107361, "step": 1352 }, { "epoch": 0.02708, "grad_norm": 3.03125, "grad_norm_var": 0.016890462239583334, "learning_rate": 0.0001, "loss": 5.5243, "loss/crossentropy": 2.399070382118225, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.32505376636981964, "step": 1354 }, { "epoch": 0.02712, "grad_norm": 3.046875, "grad_norm_var": 0.01812744140625, "learning_rate": 0.0001, "loss": 5.3306, "loss/crossentropy": 1.9084061980247498, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3073619455099106, "step": 1356 }, { "epoch": 0.02716, "grad_norm": 3.078125, "grad_norm_var": 0.01441650390625, "learning_rate": 0.0001, "loss": 5.3942, "loss/crossentropy": 2.1204254627227783, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2885961979627609, "step": 1358 }, { "epoch": 0.0272, "grad_norm": 3.28125, "grad_norm_var": 0.014842732747395834, "learning_rate": 0.0001, "loss": 5.7313, "loss/crossentropy": 2.0167239904403687, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3026747703552246, "step": 1360 }, { "epoch": 0.02724, "grad_norm": 3.171875, "grad_norm_var": 0.014742024739583333, "learning_rate": 0.0001, "loss": 5.3987, "loss/crossentropy": 1.9588357210159302, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2976878881454468, "step": 1362 }, { "epoch": 0.02728, "grad_norm": 2.796875, "grad_norm_var": 0.019449869791666668, "learning_rate": 0.0001, "loss": 5.1658, "loss/crossentropy": 1.8169561624526978, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2874959260225296, "step": 1364 }, { "epoch": 0.02732, "grad_norm": 3.1875, "grad_norm_var": 0.014354451497395834, "learning_rate": 0.0001, "loss": 5.5128, "loss/crossentropy": 2.258527398109436, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.34180621802806854, "step": 1366 }, { "epoch": 0.02736, "grad_norm": 3.09375, "grad_norm_var": 0.017284138997395834, "learning_rate": 0.0001, "loss": 5.4374, "loss/crossentropy": 2.1959571838378906, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3124794214963913, "step": 1368 }, { "epoch": 0.0274, "grad_norm": 2.875, "grad_norm_var": 0.0198150634765625, "learning_rate": 0.0001, "loss": 5.2299, "loss/crossentropy": 2.1830934286117554, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3260872811079025, "step": 1370 }, { "epoch": 0.02744, "grad_norm": 3.203125, "grad_norm_var": 0.02017822265625, "learning_rate": 0.0001, "loss": 5.6831, "loss/crossentropy": 2.411653518676758, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3575899302959442, "step": 1372 }, { "epoch": 0.02748, "grad_norm": 3.140625, "grad_norm_var": 0.021512858072916665, "learning_rate": 0.0001, "loss": 5.3919, "loss/crossentropy": 2.1585222482681274, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.30734311044216156, "step": 1374 }, { "epoch": 0.02752, "grad_norm": 2.90625, "grad_norm_var": 0.023824055989583332, "learning_rate": 0.0001, "loss": 5.37, "loss/crossentropy": 2.3621217012405396, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3373589664697647, "step": 1376 }, { "epoch": 0.02756, "grad_norm": 3.109375, "grad_norm_var": 0.023160807291666665, "learning_rate": 0.0001, "loss": 5.4712, "loss/crossentropy": 2.1203317046165466, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.28425413370132446, "step": 1378 }, { "epoch": 0.0276, "grad_norm": 3.296875, "grad_norm_var": 0.0223297119140625, "learning_rate": 0.0001, "loss": 5.5438, "loss/crossentropy": 2.2568705081939697, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.296497106552124, "step": 1380 }, { "epoch": 0.02764, "grad_norm": 3.5625, "grad_norm_var": 0.03665364583333333, "learning_rate": 0.0001, "loss": 5.5613, "loss/crossentropy": 2.260026216506958, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.34534430503845215, "step": 1382 }, { "epoch": 0.02768, "grad_norm": 5.5625, "grad_norm_var": 0.4100494384765625, "learning_rate": 0.0001, "loss": 5.6217, "loss/crossentropy": 1.9400787949562073, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.32853779196739197, "step": 1384 }, { "epoch": 0.02772, "grad_norm": 3.390625, "grad_norm_var": 0.39205322265625, "learning_rate": 0.0001, "loss": 5.2009, "loss/crossentropy": 2.168904423713684, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3421178460121155, "step": 1386 }, { "epoch": 0.02776, "grad_norm": 2.90625, "grad_norm_var": 0.405419921875, "learning_rate": 0.0001, "loss": 5.3992, "loss/crossentropy": 2.3474777936935425, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.31982940435409546, "step": 1388 }, { "epoch": 0.0278, "grad_norm": 3.140625, "grad_norm_var": 0.40615234375, "learning_rate": 0.0001, "loss": 5.5791, "loss/crossentropy": 2.3416868448257446, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3400905281305313, "step": 1390 }, { "epoch": 0.02784, "grad_norm": 2.8125, "grad_norm_var": 0.4003570556640625, "learning_rate": 0.0001, "loss": 5.4413, "loss/crossentropy": 2.299672842025757, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3143058717250824, "step": 1392 }, { "epoch": 0.02788, "grad_norm": 3.15625, "grad_norm_var": 0.3945058186848958, "learning_rate": 0.0001, "loss": 5.6231, "loss/crossentropy": 2.3058812618255615, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.33790935575962067, "step": 1394 }, { "epoch": 0.02792, "grad_norm": 3.03125, "grad_norm_var": 0.3947987874348958, "learning_rate": 0.0001, "loss": 5.324, "loss/crossentropy": 2.2137999534606934, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.33156970143318176, "step": 1396 }, { "epoch": 0.02796, "grad_norm": 4.1875, "grad_norm_var": 0.4427571614583333, "learning_rate": 0.0001, "loss": 5.4567, "loss/crossentropy": 2.04184353351593, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.32263143360614777, "step": 1398 }, { "epoch": 0.028, "grad_norm": 3.265625, "grad_norm_var": 0.09589436848958334, "learning_rate": 0.0001, "loss": 5.339, "loss/crossentropy": 2.0377472639083862, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3136487454175949, "step": 1400 }, { "epoch": 0.02804, "grad_norm": 3.171875, "grad_norm_var": 0.09228413899739583, "learning_rate": 0.0001, "loss": 5.5838, "loss/crossentropy": 2.5366055965423584, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3476262539625168, "step": 1402 }, { "epoch": 0.02808, "grad_norm": 3.1875, "grad_norm_var": 0.09063212076822917, "learning_rate": 0.0001, "loss": 5.2447, "loss/crossentropy": 2.088012456893921, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.31111815571784973, "step": 1404 }, { "epoch": 0.02812, "grad_norm": 3.53125, "grad_norm_var": 0.10530598958333333, "learning_rate": 0.0001, "loss": 5.7316, "loss/crossentropy": 2.1750329732894897, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3138856291770935, "step": 1406 }, { "epoch": 0.02816, "grad_norm": 3.625, "grad_norm_var": 0.09811909993489583, "learning_rate": 0.0001, "loss": 5.4708, "loss/crossentropy": 1.977031648159027, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3179774433374405, "step": 1408 }, { "epoch": 0.0282, "grad_norm": 3.296875, "grad_norm_var": 0.12561848958333333, "learning_rate": 0.0001, "loss": 5.3815, "loss/crossentropy": 2.0594210028648376, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.32200056314468384, "step": 1410 }, { "epoch": 0.02824, "grad_norm": 3.59375, "grad_norm_var": 0.11237691243489584, "learning_rate": 0.0001, "loss": 5.443, "loss/crossentropy": 2.3887473344802856, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3303599953651428, "step": 1412 }, { "epoch": 0.02828, "grad_norm": 3.453125, "grad_norm_var": 0.06500244140625, "learning_rate": 0.0001, "loss": 5.5507, "loss/crossentropy": 2.188898801803589, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3378629684448242, "step": 1414 }, { "epoch": 0.02832, "grad_norm": 3.171875, "grad_norm_var": 0.06982014973958334, "learning_rate": 0.0001, "loss": 5.458, "loss/crossentropy": 1.981561303138733, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2814937084913254, "step": 1416 }, { "epoch": 0.02836, "grad_norm": 3.1875, "grad_norm_var": 0.0756011962890625, "learning_rate": 0.0001, "loss": 5.271, "loss/crossentropy": 2.2141716480255127, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3429889380931854, "step": 1418 }, { "epoch": 0.0284, "grad_norm": 3.1875, "grad_norm_var": 0.0818267822265625, "learning_rate": 0.0001, "loss": 5.048, "loss/crossentropy": 2.0720977783203125, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.30601558089256287, "step": 1420 }, { "epoch": 0.02844, "grad_norm": 3.21875, "grad_norm_var": 0.07434488932291666, "learning_rate": 0.0001, "loss": 5.6407, "loss/crossentropy": 2.0516344904899597, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3182393014431, "step": 1422 }, { "epoch": 0.02848, "grad_norm": 3.28125, "grad_norm_var": 0.070166015625, "learning_rate": 0.0001, "loss": 5.6532, "loss/crossentropy": 2.161284327507019, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3383013904094696, "step": 1424 }, { "epoch": 0.02852, "grad_norm": 3.1875, "grad_norm_var": 0.03163960774739583, "learning_rate": 0.0001, "loss": 5.1594, "loss/crossentropy": 1.9955796599388123, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3169983923435211, "step": 1426 }, { "epoch": 0.02856, "grad_norm": 2.984375, "grad_norm_var": 0.023421223958333334, "learning_rate": 0.0001, "loss": 5.3951, "loss/crossentropy": 2.1046979427337646, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.30594320595264435, "step": 1428 }, { "epoch": 0.0286, "grad_norm": 2.90625, "grad_norm_var": 0.020231119791666665, "learning_rate": 0.0001, "loss": 5.364, "loss/crossentropy": 1.9611601829528809, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3039780706167221, "step": 1430 }, { "epoch": 0.02864, "grad_norm": 2.9375, "grad_norm_var": 0.021610514322916666, "learning_rate": 0.0001, "loss": 5.8823, "loss/crossentropy": 2.256209373474121, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3315662145614624, "step": 1432 }, { "epoch": 0.02868, "grad_norm": 2.9375, "grad_norm_var": 0.030594889322916666, "learning_rate": 0.0001, "loss": 5.2821, "loss/crossentropy": 2.150561034679413, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.33046241104602814, "step": 1434 }, { "epoch": 0.02872, "grad_norm": 3.078125, "grad_norm_var": 0.024898274739583334, "learning_rate": 0.0001, "loss": 5.3609, "loss/crossentropy": 2.279554605484009, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3621693551540375, "step": 1436 }, { "epoch": 0.02876, "grad_norm": 3.015625, "grad_norm_var": 0.0259918212890625, "learning_rate": 0.0001, "loss": 5.4323, "loss/crossentropy": 2.0183790922164917, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.33361808955669403, "step": 1438 }, { "epoch": 0.0288, "grad_norm": 4.15625, "grad_norm_var": 0.7352203369140625, "learning_rate": 0.0001, "loss": 5.4781, "loss/crossentropy": 1.964252531528473, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.31638333201408386, "step": 1440 }, { "epoch": 0.02884, "grad_norm": 3.046875, "grad_norm_var": 0.73902587890625, "learning_rate": 0.0001, "loss": 5.6907, "loss/crossentropy": 1.9271634817123413, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3408525586128235, "step": 1442 }, { "epoch": 0.02888, "grad_norm": 2.953125, "grad_norm_var": 0.7411417643229167, "learning_rate": 0.0001, "loss": 5.4869, "loss/crossentropy": 2.4400887489318848, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.33455249667167664, "step": 1444 }, { "epoch": 0.02892, "grad_norm": 3.078125, "grad_norm_var": 0.7270904541015625, "learning_rate": 0.0001, "loss": 5.6179, "loss/crossentropy": 2.465711832046509, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3528379648923874, "step": 1446 }, { "epoch": 0.02896, "grad_norm": 3.328125, "grad_norm_var": 0.7147288004557292, "learning_rate": 0.0001, "loss": 5.5125, "loss/crossentropy": 2.269619941711426, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.32898105680942535, "step": 1448 }, { "epoch": 0.029, "grad_norm": 2.984375, "grad_norm_var": 0.68287353515625, "learning_rate": 0.0001, "loss": 5.5775, "loss/crossentropy": 2.0418076515197754, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3170912265777588, "step": 1450 }, { "epoch": 0.02904, "grad_norm": 3.15625, "grad_norm_var": 0.6841756184895833, "learning_rate": 0.0001, "loss": 5.5964, "loss/crossentropy": 2.291188359260559, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.35694004595279694, "step": 1452 }, { "epoch": 0.02908, "grad_norm": 2.75, "grad_norm_var": 0.7253163655598959, "learning_rate": 0.0001, "loss": 5.1608, "loss/crossentropy": 2.497802972793579, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3271156847476959, "step": 1454 }, { "epoch": 0.02912, "grad_norm": 2.984375, "grad_norm_var": 0.0334625244140625, "learning_rate": 0.0001, "loss": 5.1527, "loss/crossentropy": 2.161794900894165, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.32509492337703705, "step": 1456 }, { "epoch": 0.02916, "grad_norm": 3.3125, "grad_norm_var": 0.03470052083333333, "learning_rate": 0.0001, "loss": 5.7939, "loss/crossentropy": 2.632015347480774, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.36461199820041656, "step": 1458 }, { "epoch": 0.0292, "grad_norm": 3.03125, "grad_norm_var": 0.03372395833333333, "learning_rate": 0.0001, "loss": 5.6233, "loss/crossentropy": 2.2478749752044678, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3311367332935333, "step": 1460 }, { "epoch": 0.02924, "grad_norm": 4.03125, "grad_norm_var": 0.09263916015625, "learning_rate": 0.0001, "loss": 5.2528, "loss/crossentropy": 1.8451723456382751, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.31802159547805786, "step": 1462 }, { "epoch": 0.02928, "grad_norm": 3.71875, "grad_norm_var": 0.11772359212239583, "learning_rate": 0.0001, "loss": 5.6469, "loss/crossentropy": 2.6684207916259766, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.35483625531196594, "step": 1464 }, { "epoch": 0.02932, "grad_norm": 2.921875, "grad_norm_var": 0.11543680826822916, "learning_rate": 0.0001, "loss": 5.2866, "loss/crossentropy": 2.497614622116089, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3422156721353531, "step": 1466 }, { "epoch": 0.02936, "grad_norm": 4.25, "grad_norm_var": 0.19032796223958334, "learning_rate": 0.0001, "loss": 5.7424, "loss/crossentropy": 2.750740170478821, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.377558171749115, "step": 1468 }, { "epoch": 0.0294, "grad_norm": 2.8125, "grad_norm_var": 0.17550455729166667, "learning_rate": 0.0001, "loss": 5.1741, "loss/crossentropy": 1.9961607456207275, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3166217654943466, "step": 1470 }, { "epoch": 0.02944, "grad_norm": 3.359375, "grad_norm_var": 0.1655914306640625, "learning_rate": 0.0001, "loss": 5.3114, "loss/crossentropy": 2.0874768495559692, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2912164479494095, "step": 1472 }, { "epoch": 0.02948, "grad_norm": 2.953125, "grad_norm_var": 0.174072265625, "learning_rate": 0.0001, "loss": 5.2102, "loss/crossentropy": 2.112182080745697, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.29954925179481506, "step": 1474 }, { "epoch": 0.02952, "grad_norm": 3.6875, "grad_norm_var": 0.4074371337890625, "learning_rate": 0.0001, "loss": 5.7839, "loss/crossentropy": 2.1319644451141357, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3002544492483139, "step": 1476 }, { "epoch": 0.02956, "grad_norm": 3.0625, "grad_norm_var": 0.37743733723958334, "learning_rate": 0.0001, "loss": 5.8305, "loss/crossentropy": 2.3029643297195435, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3262677788734436, "step": 1478 }, { "epoch": 0.0296, "grad_norm": 3.15625, "grad_norm_var": 0.3826243082682292, "learning_rate": 0.0001, "loss": 5.6803, "loss/crossentropy": 2.8598941564559937, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.38816460967063904, "step": 1480 }, { "epoch": 0.02964, "grad_norm": 2.984375, "grad_norm_var": 0.37844950358072915, "learning_rate": 0.0001, "loss": 5.2177, "loss/crossentropy": 2.134063720703125, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3128468096256256, "step": 1482 }, { "epoch": 0.02968, "grad_norm": 3.125, "grad_norm_var": 0.31961263020833336, "learning_rate": 0.0001, "loss": 5.5234, "loss/crossentropy": 2.481287717819214, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3209179639816284, "step": 1484 }, { "epoch": 0.02972, "grad_norm": 3.3125, "grad_norm_var": 0.30549723307291665, "learning_rate": 0.0001, "loss": 5.3571, "loss/crossentropy": 2.0571895837783813, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3121480941772461, "step": 1486 }, { "epoch": 0.02976, "grad_norm": 3.203125, "grad_norm_var": 0.30614827473958334, "learning_rate": 0.0001, "loss": 5.2725, "loss/crossentropy": 2.1930073499679565, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3031492233276367, "step": 1488 }, { "epoch": 0.0298, "grad_norm": 3.046875, "grad_norm_var": 0.30426025390625, "learning_rate": 0.0001, "loss": 5.3256, "loss/crossentropy": 2.403902530670166, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.32206277549266815, "step": 1490 }, { "epoch": 0.02984, "grad_norm": 2.9375, "grad_norm_var": 0.0538482666015625, "learning_rate": 0.0001, "loss": 5.2859, "loss/crossentropy": 2.131115198135376, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.31651052832603455, "step": 1492 }, { "epoch": 0.02988, "grad_norm": 2.828125, "grad_norm_var": 0.05826416015625, "learning_rate": 0.0001, "loss": 5.1232, "loss/crossentropy": 2.024750769138336, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3142934888601303, "step": 1494 }, { "epoch": 0.02992, "grad_norm": 2.984375, "grad_norm_var": 0.06279195149739583, "learning_rate": 0.0001, "loss": 5.3804, "loss/crossentropy": 1.9606707692146301, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.28610387444496155, "step": 1496 }, { "epoch": 0.02996, "grad_norm": 3.15625, "grad_norm_var": 0.062474568684895836, "learning_rate": 0.0001, "loss": 5.5731, "loss/crossentropy": 2.31516432762146, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.32875190675258636, "step": 1498 }, { "epoch": 0.03, "grad_norm": 3.359375, "grad_norm_var": 0.0711822509765625, "learning_rate": 0.0001, "loss": 5.7626, "loss/crossentropy": 2.295942783355713, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3379499167203903, "step": 1500 }, { "epoch": 0.03004, "grad_norm": 3.21875, "grad_norm_var": 0.06961263020833333, "learning_rate": 0.0001, "loss": 5.5997, "loss/crossentropy": 2.1859577894210815, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.32143887877464294, "step": 1502 }, { "epoch": 0.03008, "grad_norm": 3.28125, "grad_norm_var": 0.07023824055989583, "learning_rate": 0.0001, "loss": 5.4159, "loss/crossentropy": 2.1852502822875977, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3582807630300522, "step": 1504 }, { "epoch": 0.03012, "grad_norm": 2.75, "grad_norm_var": 0.07810770670572917, "learning_rate": 0.0001, "loss": 5.3011, "loss/crossentropy": 2.174897611141205, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.33067959547042847, "step": 1506 }, { "epoch": 0.03016, "grad_norm": 2.96875, "grad_norm_var": 0.04057515462239583, "learning_rate": 0.0001, "loss": 5.2775, "loss/crossentropy": 2.24343740940094, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.34563779830932617, "step": 1508 }, { "epoch": 0.0302, "grad_norm": 2.90625, "grad_norm_var": 0.043473307291666666, "learning_rate": 0.0001, "loss": 5.317, "loss/crossentropy": 1.9828822612762451, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2887475937604904, "step": 1510 }, { "epoch": 0.03024, "grad_norm": 3.0, "grad_norm_var": 0.03655192057291667, "learning_rate": 0.0001, "loss": 5.3172, "loss/crossentropy": 2.2110393047332764, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.32290786504745483, "step": 1512 }, { "epoch": 0.03028, "grad_norm": 2.90625, "grad_norm_var": 0.04527587890625, "learning_rate": 0.0001, "loss": 5.2598, "loss/crossentropy": 2.3797603845596313, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3262799382209778, "step": 1514 }, { "epoch": 0.03032, "grad_norm": 3.1875, "grad_norm_var": 0.0281158447265625, "learning_rate": 0.0001, "loss": 5.4107, "loss/crossentropy": 2.0183085799217224, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2915680408477783, "step": 1516 }, { "epoch": 0.03036, "grad_norm": 3.78125, "grad_norm_var": 0.9888631184895833, "learning_rate": 0.0001, "loss": 5.4632, "loss/crossentropy": 1.875212013721466, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.30001460015773773, "step": 1518 }, { "epoch": 0.0304, "grad_norm": 3.0625, "grad_norm_var": 0.9916300455729167, "learning_rate": 0.0001, "loss": 5.4406, "loss/crossentropy": 2.1000564098358154, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.354188472032547, "step": 1520 }, { "epoch": 0.03044, "grad_norm": 2.84375, "grad_norm_var": 0.980126953125, "learning_rate": 0.0001, "loss": 5.4837, "loss/crossentropy": 2.071643114089966, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.333335280418396, "step": 1522 }, { "epoch": 0.03048, "grad_norm": 3.203125, "grad_norm_var": 0.9749664306640625, "learning_rate": 0.0001, "loss": 5.2716, "loss/crossentropy": 2.4253947734832764, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.31365686655044556, "step": 1524 }, { "epoch": 0.03052, "grad_norm": 3.0625, "grad_norm_var": 0.9605133056640625, "learning_rate": 0.0001, "loss": 5.1601, "loss/crossentropy": 1.967090904712677, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.30677899718284607, "step": 1526 }, { "epoch": 0.03056, "grad_norm": 3.15625, "grad_norm_var": 0.954443359375, "learning_rate": 0.0001, "loss": 5.0971, "loss/crossentropy": 2.112701952457428, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3029911667108536, "step": 1528 }, { "epoch": 0.0306, "grad_norm": 4.5625, "grad_norm_var": 1.0084269205729166, "learning_rate": 0.0001, "loss": 5.6836, "loss/crossentropy": 2.5657063722610474, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3484763503074646, "step": 1530 }, { "epoch": 0.03064, "grad_norm": 3.234375, "grad_norm_var": 1.110497029622396, "learning_rate": 0.0001, "loss": 5.3888, "loss/crossentropy": 2.1214585304260254, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3243858218193054, "step": 1532 }, { "epoch": 0.03068, "grad_norm": 3.234375, "grad_norm_var": 0.35461832682291666, "learning_rate": 0.0001, "loss": 5.4662, "loss/crossentropy": 2.427902936935425, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3428474962711334, "step": 1534 }, { "epoch": 0.03072, "grad_norm": 3.078125, "grad_norm_var": 0.36741434733072914, "learning_rate": 0.0001, "loss": 5.2956, "loss/crossentropy": 1.975690484046936, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3136949688196182, "step": 1536 }, { "epoch": 0.03076, "grad_norm": 2.734375, "grad_norm_var": 0.38342692057291666, "learning_rate": 0.0001, "loss": 5.1233, "loss/crossentropy": 2.295845150947571, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.31703390181064606, "step": 1538 }, { "epoch": 0.0308, "grad_norm": 3.171875, "grad_norm_var": 0.37892252604166665, "learning_rate": 0.0001, "loss": 5.8286, "loss/crossentropy": 2.117497444152832, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.32601243257522583, "step": 1540 }, { "epoch": 0.03084, "grad_norm": 3.140625, "grad_norm_var": 0.49339192708333335, "learning_rate": 0.0001, "loss": 5.4065, "loss/crossentropy": 2.3824862241744995, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3332519829273224, "step": 1542 }, { "epoch": 0.03088, "grad_norm": 3.140625, "grad_norm_var": 0.4940582275390625, "learning_rate": 0.0001, "loss": 5.0672, "loss/crossentropy": 1.9037857055664062, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3143462985754013, "step": 1544 }, { "epoch": 0.03092, "grad_norm": 3.09375, "grad_norm_var": 0.4064737955729167, "learning_rate": 0.0001, "loss": 5.2328, "loss/crossentropy": 1.9191133379936218, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.29443541169166565, "step": 1546 }, { "epoch": 0.03096, "grad_norm": 2.734375, "grad_norm_var": 0.28172200520833335, "learning_rate": 0.0001, "loss": 5.3402, "loss/crossentropy": 2.216462254524231, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.31411711871623993, "step": 1548 }, { "epoch": 0.031, "grad_norm": 2.96875, "grad_norm_var": 0.227685546875, "learning_rate": 0.0001, "loss": 5.0455, "loss/crossentropy": 1.8064754605293274, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2667318135499954, "step": 1550 }, { "epoch": 0.03104, "grad_norm": 3.453125, "grad_norm_var": 0.46340738932291664, "learning_rate": 0.0001, "loss": 5.5672, "loss/crossentropy": 2.488176465034485, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3422655761241913, "step": 1552 }, { "epoch": 0.03108, "grad_norm": 2.859375, "grad_norm_var": 0.4529205322265625, "learning_rate": 0.0001, "loss": 5.4404, "loss/crossentropy": 2.5670164823532104, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3462950587272644, "step": 1554 }, { "epoch": 0.03112, "grad_norm": 3.25, "grad_norm_var": 0.46499735514322915, "learning_rate": 0.0001, "loss": 5.0295, "loss/crossentropy": 2.0630581378936768, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.29241877794265747, "step": 1556 }, { "epoch": 0.03116, "grad_norm": 2.84375, "grad_norm_var": 0.30891011555989584, "learning_rate": 0.0001, "loss": 5.4751, "loss/crossentropy": 2.685954213142395, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.37143297493457794, "step": 1558 }, { "epoch": 0.0312, "grad_norm": 3.421875, "grad_norm_var": 0.311669921875, "learning_rate": 0.0001, "loss": 5.6995, "loss/crossentropy": 1.9786988496780396, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.40276341140270233, "step": 1560 }, { "epoch": 0.03124, "grad_norm": 3.0, "grad_norm_var": 0.31383056640625, "learning_rate": 0.0001, "loss": 5.6695, "loss/crossentropy": 2.1484411358833313, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3264364004135132, "step": 1562 }, { "epoch": 0.03128, "grad_norm": 2.875, "grad_norm_var": 0.3103424072265625, "learning_rate": 0.0001, "loss": 5.3015, "loss/crossentropy": 2.1411852836608887, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3156583160161972, "step": 1564 }, { "epoch": 0.03132, "grad_norm": 2.90625, "grad_norm_var": 0.31302083333333336, "learning_rate": 0.0001, "loss": 5.2911, "loss/crossentropy": 2.1509006023406982, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3066476732492447, "step": 1566 }, { "epoch": 0.03136, "grad_norm": 2.90625, "grad_norm_var": 0.0416412353515625, "learning_rate": 0.0001, "loss": 5.1904, "loss/crossentropy": 1.7540676593780518, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2705047130584717, "step": 1568 }, { "epoch": 0.0314, "grad_norm": 3.046875, "grad_norm_var": 0.0431640625, "learning_rate": 0.0001, "loss": 4.9637, "loss/crossentropy": 2.2091184854507446, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2920738309621811, "step": 1570 }, { "epoch": 0.03144, "grad_norm": 2.984375, "grad_norm_var": 0.045735677083333336, "learning_rate": 0.0001, "loss": 5.6048, "loss/crossentropy": 1.8405091762542725, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2981649935245514, "step": 1572 }, { "epoch": 0.03148, "grad_norm": 2.6875, "grad_norm_var": 0.0420806884765625, "learning_rate": 0.0001, "loss": 5.4258, "loss/crossentropy": 2.2292457818984985, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.30654460191726685, "step": 1574 }, { "epoch": 0.03152, "grad_norm": 4.21875, "grad_norm_var": 0.12868550618489583, "learning_rate": 0.0001, "loss": 5.8439, "loss/crossentropy": 2.653907895088196, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.35303865373134613, "step": 1576 }, { "epoch": 0.03156, "grad_norm": 3.234375, "grad_norm_var": 0.13888346354166667, "learning_rate": 0.0001, "loss": 5.6145, "loss/crossentropy": 2.6799376010894775, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.37383997440338135, "step": 1578 }, { "epoch": 0.0316, "grad_norm": 2.828125, "grad_norm_var": 0.14431864420572918, "learning_rate": 0.0001, "loss": 5.3321, "loss/crossentropy": 2.3438535928726196, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.30706922709941864, "step": 1580 }, { "epoch": 0.03164, "grad_norm": 2.96875, "grad_norm_var": 0.14254150390625, "learning_rate": 0.0001, "loss": 5.2139, "loss/crossentropy": 2.1885964274406433, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.30707718431949615, "step": 1582 }, { "epoch": 0.03168, "grad_norm": 2.984375, "grad_norm_var": 0.14042561848958332, "learning_rate": 0.0001, "loss": 5.1661, "loss/crossentropy": 2.0471584796905518, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.31723180413246155, "step": 1584 }, { "epoch": 0.03172, "grad_norm": 3.0, "grad_norm_var": 0.1363433837890625, "learning_rate": 0.0001, "loss": 5.227, "loss/crossentropy": 2.135176420211792, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.29839280247688293, "step": 1586 }, { "epoch": 0.03176, "grad_norm": 3.3125, "grad_norm_var": 0.13482666015625, "learning_rate": 0.0001, "loss": 5.6412, "loss/crossentropy": 2.4444775581359863, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.33574284613132477, "step": 1588 }, { "epoch": 0.0318, "grad_norm": 3.140625, "grad_norm_var": 0.12195536295572916, "learning_rate": 0.0001, "loss": 5.4289, "loss/crossentropy": 2.1725653409957886, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.310588076710701, "step": 1590 }, { "epoch": 0.03184, "grad_norm": 2.875, "grad_norm_var": 0.04572652180989583, "learning_rate": 0.0001, "loss": 5.3727, "loss/crossentropy": 2.3610929250717163, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.32158929109573364, "step": 1592 }, { "epoch": 0.03188, "grad_norm": 2.84375, "grad_norm_var": 0.0502105712890625, "learning_rate": 0.0001, "loss": 4.8794, "loss/crossentropy": 1.9271156787872314, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.28283432126045227, "step": 1594 }, { "epoch": 0.03192, "grad_norm": 3.203125, "grad_norm_var": 0.04372456868489583, "learning_rate": 0.0001, "loss": 5.3912, "loss/crossentropy": 2.4196890592575073, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3463610112667084, "step": 1596 }, { "epoch": 0.03196, "grad_norm": 2.71875, "grad_norm_var": 0.0500152587890625, "learning_rate": 0.0001, "loss": 5.1524, "loss/crossentropy": 2.207236647605896, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.33829018473625183, "step": 1598 }, { "epoch": 0.032, "grad_norm": 3.0625, "grad_norm_var": 0.04101155598958333, "learning_rate": 0.0001, "loss": 5.4724, "loss/crossentropy": 2.3757678270339966, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3342677056789398, "step": 1600 }, { "epoch": 0.03204, "grad_norm": 2.921875, "grad_norm_var": 0.04512430826822917, "learning_rate": 0.0001, "loss": 5.1763, "loss/crossentropy": 2.2605016231536865, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3062159866094589, "step": 1602 }, { "epoch": 0.03208, "grad_norm": 3.359375, "grad_norm_var": 0.04794921875, "learning_rate": 0.0001, "loss": 5.7139, "loss/crossentropy": 2.4536768198013306, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.36668023467063904, "step": 1604 }, { "epoch": 0.03212, "grad_norm": 3.09375, "grad_norm_var": 0.0523834228515625, "learning_rate": 0.0001, "loss": 5.137, "loss/crossentropy": 1.9870036244392395, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2974477708339691, "step": 1606 }, { "epoch": 0.03216, "grad_norm": 3.015625, "grad_norm_var": 0.039876302083333336, "learning_rate": 0.0001, "loss": 5.3926, "loss/crossentropy": 2.1852506399154663, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3327452540397644, "step": 1608 }, { "epoch": 0.0322, "grad_norm": 3.140625, "grad_norm_var": 0.03128255208333333, "learning_rate": 0.0001, "loss": 5.4964, "loss/crossentropy": 2.2226197719573975, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3110218793153763, "step": 1610 }, { "epoch": 0.03224, "grad_norm": 3.09375, "grad_norm_var": 0.029523722330729165, "learning_rate": 0.0001, "loss": 5.398, "loss/crossentropy": 1.8255922198295593, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.28216174244880676, "step": 1612 }, { "epoch": 0.03228, "grad_norm": 2.96875, "grad_norm_var": 0.023200480143229167, "learning_rate": 0.0001, "loss": 5.2777, "loss/crossentropy": 1.9663920998573303, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3095496743917465, "step": 1614 }, { "epoch": 0.03232, "grad_norm": 2.953125, "grad_norm_var": 0.023908487955729165, "learning_rate": 0.0001, "loss": 5.1578, "loss/crossentropy": 2.2089942693710327, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3307983875274658, "step": 1616 }, { "epoch": 0.03236, "grad_norm": 3.03125, "grad_norm_var": 0.01793212890625, "learning_rate": 0.0001, "loss": 5.3331, "loss/crossentropy": 2.261039137840271, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.314799427986145, "step": 1618 }, { "epoch": 0.0324, "grad_norm": 2.859375, "grad_norm_var": 0.015555826822916667, "learning_rate": 0.0001, "loss": 5.1674, "loss/crossentropy": 2.350824236869812, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3515756279230118, "step": 1620 }, { "epoch": 0.03244, "grad_norm": 2.9375, "grad_norm_var": 0.010749308268229167, "learning_rate": 0.0001, "loss": 5.4853, "loss/crossentropy": 2.2964736223220825, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3059935122728348, "step": 1622 }, { "epoch": 0.03248, "grad_norm": 2.859375, "grad_norm_var": 0.016597493489583334, "learning_rate": 0.0001, "loss": 5.2503, "loss/crossentropy": 2.36459481716156, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.33763329684734344, "step": 1624 }, { "epoch": 0.03252, "grad_norm": 2.984375, "grad_norm_var": 0.0126617431640625, "learning_rate": 0.0001, "loss": 5.2242, "loss/crossentropy": 2.165920853614807, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.31318019330501556, "step": 1626 }, { "epoch": 0.03256, "grad_norm": 2.859375, "grad_norm_var": 0.010628255208333333, "learning_rate": 0.0001, "loss": 5.2544, "loss/crossentropy": 2.326790690422058, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3044355511665344, "step": 1628 }, { "epoch": 0.0326, "grad_norm": 2.96875, "grad_norm_var": 0.01129150390625, "learning_rate": 0.0001, "loss": 5.3369, "loss/crossentropy": 1.8848688006401062, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.306812584400177, "step": 1630 }, { "epoch": 0.03264, "grad_norm": 2.953125, "grad_norm_var": 0.0134185791015625, "learning_rate": 0.0001, "loss": 5.351, "loss/crossentropy": 2.045863091945648, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3091462701559067, "step": 1632 }, { "epoch": 0.03268, "grad_norm": 2.78125, "grad_norm_var": 0.01148681640625, "learning_rate": 0.0001, "loss": 4.9742, "loss/crossentropy": 2.0707273483276367, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3038959503173828, "step": 1634 }, { "epoch": 0.03272, "grad_norm": 2.859375, "grad_norm_var": 0.011237589518229167, "learning_rate": 0.0001, "loss": 5.2076, "loss/crossentropy": 2.0786932706832886, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.29513464868068695, "step": 1636 }, { "epoch": 0.03276, "grad_norm": 3.203125, "grad_norm_var": 0.01900634765625, "learning_rate": 0.0001, "loss": 5.4237, "loss/crossentropy": 2.1527108550071716, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.32102274894714355, "step": 1638 }, { "epoch": 0.0328, "grad_norm": 3.15625, "grad_norm_var": 0.021361287434895834, "learning_rate": 0.0001, "loss": 5.126, "loss/crossentropy": 2.0383081436157227, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3253529220819473, "step": 1640 }, { "epoch": 0.03284, "grad_norm": 2.734375, "grad_norm_var": 0.021809895833333332, "learning_rate": 0.0001, "loss": 5.0775, "loss/crossentropy": 2.2801902294158936, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.32200203835964203, "step": 1642 }, { "epoch": 0.03288, "grad_norm": 2.953125, "grad_norm_var": 0.022184244791666665, "learning_rate": 0.0001, "loss": 5.3035, "loss/crossentropy": 2.164717435836792, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2997436225414276, "step": 1644 }, { "epoch": 0.03292, "grad_norm": 3.1875, "grad_norm_var": 0.025191243489583334, "learning_rate": 0.0001, "loss": 5.5166, "loss/crossentropy": 2.389414072036743, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.32998231053352356, "step": 1646 }, { "epoch": 0.03296, "grad_norm": 2.921875, "grad_norm_var": 0.023949178059895833, "learning_rate": 0.0001, "loss": 5.3225, "loss/crossentropy": 2.09418523311615, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.33829881250858307, "step": 1648 }, { "epoch": 0.033, "grad_norm": 3.109375, "grad_norm_var": 0.021800740559895834, "learning_rate": 0.0001, "loss": 5.261, "loss/crossentropy": 2.324030637741089, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.30337512493133545, "step": 1650 }, { "epoch": 0.03304, "grad_norm": 3.109375, "grad_norm_var": 0.020992024739583334, "learning_rate": 0.0001, "loss": 5.4162, "loss/crossentropy": 1.8635556101799011, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.27077023684978485, "step": 1652 }, { "epoch": 0.03308, "grad_norm": 2.8125, "grad_norm_var": 0.0215972900390625, "learning_rate": 0.0001, "loss": 5.2371, "loss/crossentropy": 2.1776190996170044, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.30546560883522034, "step": 1654 }, { "epoch": 0.03312, "grad_norm": 3.1875, "grad_norm_var": 0.021240234375, "learning_rate": 0.0001, "loss": 5.1721, "loss/crossentropy": 2.1003682613372803, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3304767310619354, "step": 1656 }, { "epoch": 0.03316, "grad_norm": 3.203125, "grad_norm_var": 0.02584228515625, "learning_rate": 0.0001, "loss": 5.8029, "loss/crossentropy": 2.4331823587417603, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3653264045715332, "step": 1658 }, { "epoch": 0.0332, "grad_norm": 3.125, "grad_norm_var": 0.1084381103515625, "learning_rate": 0.0001, "loss": 5.7157, "loss/crossentropy": 1.9939777851104736, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.28948865830898285, "step": 1660 }, { "epoch": 0.03324, "grad_norm": 2.9375, "grad_norm_var": 0.10896809895833333, "learning_rate": 0.0001, "loss": 4.933, "loss/crossentropy": 1.9093859791755676, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3007328063249588, "step": 1662 }, { "epoch": 0.03328, "grad_norm": 2.734375, "grad_norm_var": 0.11998697916666666, "learning_rate": 0.0001, "loss": 5.1861, "loss/crossentropy": 2.2847355604171753, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2975463569164276, "step": 1664 }, { "epoch": 0.03332, "grad_norm": 3.046875, "grad_norm_var": 0.120166015625, "learning_rate": 0.0001, "loss": 5.3623, "loss/crossentropy": 2.0081310868263245, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.28596948087215424, "step": 1666 }, { "epoch": 0.03336, "grad_norm": 3.125, "grad_norm_var": 0.11974283854166666, "learning_rate": 0.0001, "loss": 5.2338, "loss/crossentropy": 2.189584493637085, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.30838510394096375, "step": 1668 }, { "epoch": 0.0334, "grad_norm": 2.796875, "grad_norm_var": 0.12802327473958333, "learning_rate": 0.0001, "loss": 5.2749, "loss/crossentropy": 2.204169988632202, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3091500401496887, "step": 1670 }, { "epoch": 0.03344, "grad_norm": 2.75, "grad_norm_var": 0.1338775634765625, "learning_rate": 0.0001, "loss": 5.2755, "loss/crossentropy": 2.195966899394989, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.30603383481502533, "step": 1672 }, { "epoch": 0.03348, "grad_norm": 3.0, "grad_norm_var": 0.1251953125, "learning_rate": 0.0001, "loss": 5.8768, "loss/crossentropy": 2.5402153730392456, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3657945841550827, "step": 1674 }, { "epoch": 0.03352, "grad_norm": 3.09375, "grad_norm_var": 0.0277252197265625, "learning_rate": 0.0001, "loss": 5.5387, "loss/crossentropy": 2.1721729040145874, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.29539716243743896, "step": 1676 }, { "epoch": 0.03356, "grad_norm": 2.953125, "grad_norm_var": 0.027253214518229166, "learning_rate": 0.0001, "loss": 5.2355, "loss/crossentropy": 2.0591543912887573, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.29050062596797943, "step": 1678 }, { "epoch": 0.0336, "grad_norm": 2.765625, "grad_norm_var": 0.025846354166666665, "learning_rate": 0.0001, "loss": 5.4895, "loss/crossentropy": 2.1396639347076416, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3261077404022217, "step": 1680 }, { "epoch": 0.03364, "grad_norm": 3.015625, "grad_norm_var": 0.024689737955729166, "learning_rate": 0.0001, "loss": 5.2094, "loss/crossentropy": 1.9553123712539673, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3204822689294815, "step": 1682 }, { "epoch": 0.03368, "grad_norm": 2.78125, "grad_norm_var": 0.023746744791666666, "learning_rate": 0.0001, "loss": 5.3417, "loss/crossentropy": 2.4139883518218994, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.32602658867836, "step": 1684 }, { "epoch": 0.03372, "grad_norm": 2.90625, "grad_norm_var": 0.019624837239583335, "learning_rate": 0.0001, "loss": 5.3479, "loss/crossentropy": 1.9849293231964111, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.32610756158828735, "step": 1686 }, { "epoch": 0.03376, "grad_norm": 2.953125, "grad_norm_var": 0.008622233072916667, "learning_rate": 0.0001, "loss": 5.6218, "loss/crossentropy": 2.698970675468445, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3749641329050064, "step": 1688 }, { "epoch": 0.0338, "grad_norm": 3.0625, "grad_norm_var": 0.01285400390625, "learning_rate": 0.0001, "loss": 5.7617, "loss/crossentropy": 2.715620517730713, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.35715436935424805, "step": 1690 }, { "epoch": 0.03384, "grad_norm": 3.0, "grad_norm_var": 0.01201171875, "learning_rate": 0.0001, "loss": 5.5073, "loss/crossentropy": 2.7213666439056396, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.34585659205913544, "step": 1692 }, { "epoch": 0.03388, "grad_norm": 2.84375, "grad_norm_var": 0.016044108072916667, "learning_rate": 0.0001, "loss": 5.2674, "loss/crossentropy": 2.277606725692749, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3140410780906677, "step": 1694 }, { "epoch": 0.03392, "grad_norm": 3.765625, "grad_norm_var": 0.05364176432291667, "learning_rate": 0.0001, "loss": 5.2633, "loss/crossentropy": 2.332197904586792, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.30997559428215027, "step": 1696 }, { "epoch": 0.03396, "grad_norm": 2.953125, "grad_norm_var": 0.05464579264322917, "learning_rate": 0.0001, "loss": 5.4323, "loss/crossentropy": 2.4413230419158936, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3075388967990875, "step": 1698 }, { "epoch": 0.034, "grad_norm": 2.859375, "grad_norm_var": 0.0567535400390625, "learning_rate": 0.0001, "loss": 5.1099, "loss/crossentropy": 2.2601696252822876, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3050367534160614, "step": 1700 }, { "epoch": 0.03404, "grad_norm": 2.640625, "grad_norm_var": 0.06572977701822917, "learning_rate": 0.0001, "loss": 4.9925, "loss/crossentropy": 2.2910414934158325, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3248990923166275, "step": 1702 }, { "epoch": 0.03408, "grad_norm": 2.890625, "grad_norm_var": 0.06843159993489584, "learning_rate": 0.0001, "loss": 5.3814, "loss/crossentropy": 1.9898682832717896, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.27573561668395996, "step": 1704 }, { "epoch": 0.03412, "grad_norm": 2.96875, "grad_norm_var": 0.06398824055989584, "learning_rate": 0.0001, "loss": 5.1506, "loss/crossentropy": 2.1234602332115173, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.30724021792411804, "step": 1706 }, { "epoch": 0.03416, "grad_norm": 3.390625, "grad_norm_var": 0.0764801025390625, "learning_rate": 0.0001, "loss": 5.5433, "loss/crossentropy": 2.34807026386261, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.32687780261039734, "step": 1708 }, { "epoch": 0.0342, "grad_norm": 2.71875, "grad_norm_var": 0.07669169108072917, "learning_rate": 0.0001, "loss": 5.1249, "loss/crossentropy": 2.17264860868454, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3086177706718445, "step": 1710 }, { "epoch": 0.03424, "grad_norm": 2.96875, "grad_norm_var": 0.0303131103515625, "learning_rate": 0.0001, "loss": 5.3613, "loss/crossentropy": 2.1094497442245483, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.29483039677143097, "step": 1712 }, { "epoch": 0.03428, "grad_norm": 2.71875, "grad_norm_var": 0.03284505208333333, "learning_rate": 0.0001, "loss": 5.256, "loss/crossentropy": 2.2379074692726135, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2799292802810669, "step": 1714 }, { "epoch": 0.03432, "grad_norm": 3.078125, "grad_norm_var": 0.21389567057291667, "learning_rate": 0.0001, "loss": 5.5834, "loss/crossentropy": 2.4616905450820923, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.33721986413002014, "step": 1716 }, { "epoch": 0.03436, "grad_norm": 2.953125, "grad_norm_var": 0.2042877197265625, "learning_rate": 0.0001, "loss": 5.4415, "loss/crossentropy": 2.383226990699768, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3242805302143097, "step": 1718 }, { "epoch": 0.0344, "grad_norm": 2.921875, "grad_norm_var": 0.19931538899739584, "learning_rate": 0.0001, "loss": 5.5514, "loss/crossentropy": 2.495948314666748, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3131762146949768, "step": 1720 }, { "epoch": 0.03444, "grad_norm": 3.109375, "grad_norm_var": 0.19524739583333334, "learning_rate": 0.0001, "loss": 5.6767, "loss/crossentropy": 2.1921653747558594, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.320631667971611, "step": 1722 }, { "epoch": 0.03448, "grad_norm": 3.40625, "grad_norm_var": 0.20221354166666666, "learning_rate": 0.0001, "loss": 5.2, "loss/crossentropy": 2.0795475840568542, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.29602357745170593, "step": 1724 }, { "epoch": 0.03452, "grad_norm": 2.75, "grad_norm_var": 0.201171875, "learning_rate": 0.0001, "loss": 4.7489, "loss/crossentropy": 1.911207377910614, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.287412166595459, "step": 1726 }, { "epoch": 0.03456, "grad_norm": 2.96875, "grad_norm_var": 0.20129801432291666, "learning_rate": 0.0001, "loss": 5.155, "loss/crossentropy": 2.0638214349746704, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.32240423560142517, "step": 1728 }, { "epoch": 0.0346, "grad_norm": 2.75, "grad_norm_var": 0.20066630045572917, "learning_rate": 0.0001, "loss": 5.3464, "loss/crossentropy": 2.355573534965515, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3161381185054779, "step": 1730 }, { "epoch": 0.03464, "grad_norm": 2.8125, "grad_norm_var": 0.04185282389322917, "learning_rate": 0.0001, "loss": 5.1039, "loss/crossentropy": 2.2227123975753784, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.31909704208374023, "step": 1732 }, { "epoch": 0.03468, "grad_norm": 2.765625, "grad_norm_var": 0.03954671223958333, "learning_rate": 0.0001, "loss": 5.2249, "loss/crossentropy": 2.203602910041809, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3238847255706787, "step": 1734 }, { "epoch": 0.03472, "grad_norm": 3.34375, "grad_norm_var": 0.07062174479166666, "learning_rate": 0.0001, "loss": 5.0623, "loss/crossentropy": 2.2696332335472107, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3174774497747421, "step": 1736 }, { "epoch": 0.03476, "grad_norm": 2.78125, "grad_norm_var": 0.07385660807291666, "learning_rate": 0.0001, "loss": 5.0469, "loss/crossentropy": 1.8124465942382812, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2645348161458969, "step": 1738 }, { "epoch": 0.0348, "grad_norm": 3.015625, "grad_norm_var": 0.0578765869140625, "learning_rate": 0.0001, "loss": 5.589, "loss/crossentropy": 2.1951464414596558, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.31606370210647583, "step": 1740 }, { "epoch": 0.03484, "grad_norm": 3.109375, "grad_norm_var": 0.05624593098958333, "learning_rate": 0.0001, "loss": 5.4867, "loss/crossentropy": 2.4413230419158936, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.33688417077064514, "step": 1742 }, { "epoch": 0.03488, "grad_norm": 3.09375, "grad_norm_var": 0.05705973307291667, "learning_rate": 0.0001, "loss": 5.7898, "loss/crossentropy": 2.1357219219207764, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3476516157388687, "step": 1744 }, { "epoch": 0.03492, "grad_norm": 2.859375, "grad_norm_var": 0.0551666259765625, "learning_rate": 0.0001, "loss": 5.4465, "loss/crossentropy": 2.1557592153549194, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3080083876848221, "step": 1746 }, { "epoch": 0.03496, "grad_norm": 2.859375, "grad_norm_var": 0.0512603759765625, "learning_rate": 0.0001, "loss": 5.4949, "loss/crossentropy": 2.3549705743789673, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.33814217150211334, "step": 1748 }, { "epoch": 0.035, "grad_norm": 3.109375, "grad_norm_var": 0.04348958333333333, "learning_rate": 0.0001, "loss": 5.5881, "loss/crossentropy": 2.382844924926758, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.31944599747657776, "step": 1750 }, { "epoch": 0.03504, "grad_norm": 2.765625, "grad_norm_var": 0.02138671875, "learning_rate": 0.0001, "loss": 5.2937, "loss/crossentropy": 2.3312920331954956, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3346693813800812, "step": 1752 }, { "epoch": 0.03508, "grad_norm": 3.015625, "grad_norm_var": 0.019677734375, "learning_rate": 0.0001, "loss": 5.1981, "loss/crossentropy": 2.1921491026878357, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.31133508682250977, "step": 1754 }, { "epoch": 0.03512, "grad_norm": 2.71875, "grad_norm_var": 0.024494425455729166, "learning_rate": 0.0001, "loss": 4.8796, "loss/crossentropy": 2.0229611992836, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26832816004753113, "step": 1756 }, { "epoch": 0.03516, "grad_norm": 2.75, "grad_norm_var": 0.025764973958333333, "learning_rate": 0.0001, "loss": 5.0322, "loss/crossentropy": 1.8138108849525452, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2576165944337845, "step": 1758 }, { "epoch": 0.0352, "grad_norm": 2.75, "grad_norm_var": 0.0232574462890625, "learning_rate": 0.0001, "loss": 5.1276, "loss/crossentropy": 2.0019126534461975, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.28094957768917084, "step": 1760 }, { "epoch": 0.03524, "grad_norm": 2.703125, "grad_norm_var": 0.022554524739583335, "learning_rate": 0.0001, "loss": 5.1776, "loss/crossentropy": 2.400240898132324, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.30936548113822937, "step": 1762 }, { "epoch": 0.03528, "grad_norm": 3.328125, "grad_norm_var": 0.03495992024739583, "learning_rate": 0.0001, "loss": 4.9401, "loss/crossentropy": 2.019958734512329, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.27741560339927673, "step": 1764 }, { "epoch": 0.03532, "grad_norm": 2.875, "grad_norm_var": 0.025032552083333333, "learning_rate": 0.0001, "loss": 5.0912, "loss/crossentropy": 1.9099596738815308, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.28769225627183914, "step": 1766 }, { "epoch": 0.03536, "grad_norm": 2.890625, "grad_norm_var": 0.024344889322916667, "learning_rate": 0.0001, "loss": 5.42, "loss/crossentropy": 2.2508283853530884, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2945093661546707, "step": 1768 }, { "epoch": 0.0354, "grad_norm": 2.828125, "grad_norm_var": 0.0222320556640625, "learning_rate": 0.0001, "loss": 5.2516, "loss/crossentropy": 1.9332409501075745, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26304441690444946, "step": 1770 }, { "epoch": 0.03544, "grad_norm": 2.875, "grad_norm_var": 0.022557576497395832, "learning_rate": 0.0001, "loss": 5.1896, "loss/crossentropy": 2.143627643585205, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.28407415747642517, "step": 1772 }, { "epoch": 0.03548, "grad_norm": 2.859375, "grad_norm_var": 0.029683430989583332, "learning_rate": 0.0001, "loss": 5.4763, "loss/crossentropy": 2.32085120677948, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3176119029521942, "step": 1774 }, { "epoch": 0.03552, "grad_norm": 2.828125, "grad_norm_var": 0.028539021809895832, "learning_rate": 0.0001, "loss": 5.1717, "loss/crossentropy": 1.7893801927566528, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.27407801151275635, "step": 1776 }, { "epoch": 0.03556, "grad_norm": 2.78125, "grad_norm_var": 0.027228800455729167, "learning_rate": 0.0001, "loss": 5.42, "loss/crossentropy": 2.206292986869812, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3071517199277878, "step": 1778 }, { "epoch": 0.0356, "grad_norm": 2.84375, "grad_norm_var": 0.013263956705729166, "learning_rate": 0.0001, "loss": 5.1879, "loss/crossentropy": 2.1285043954849243, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3164139539003372, "step": 1780 }, { "epoch": 0.03564, "grad_norm": 3.15625, "grad_norm_var": 0.018635050455729166, "learning_rate": 0.0001, "loss": 5.2826, "loss/crossentropy": 2.16570383310318, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3052050769329071, "step": 1782 }, { "epoch": 0.03568, "grad_norm": 2.734375, "grad_norm_var": 0.019950358072916667, "learning_rate": 0.0001, "loss": 5.0921, "loss/crossentropy": 1.9799941778182983, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2655785381793976, "step": 1784 }, { "epoch": 0.03572, "grad_norm": 2.765625, "grad_norm_var": 0.0209625244140625, "learning_rate": 0.0001, "loss": 5.2468, "loss/crossentropy": 1.9801498651504517, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2824363112449646, "step": 1786 }, { "epoch": 0.03576, "grad_norm": 2.828125, "grad_norm_var": 0.020947265625, "learning_rate": 0.0001, "loss": 5.0131, "loss/crossentropy": 1.5805786848068237, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25234321504831314, "step": 1788 }, { "epoch": 0.0358, "grad_norm": 3.15625, "grad_norm_var": 0.021484375, "learning_rate": 0.0001, "loss": 5.296, "loss/crossentropy": 2.2434048652648926, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2896551638841629, "step": 1790 }, { "epoch": 0.03584, "grad_norm": 2.78125, "grad_norm_var": 0.022411092122395834, "learning_rate": 0.0001, "loss": 5.0179, "loss/crossentropy": 1.9738762378692627, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2959328889846802, "step": 1792 }, { "epoch": 0.03588, "grad_norm": 3.0625, "grad_norm_var": 0.0272857666015625, "learning_rate": 0.0001, "loss": 5.2317, "loss/crossentropy": 2.222583770751953, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.29915711283683777, "step": 1794 }, { "epoch": 0.03592, "grad_norm": 2.8125, "grad_norm_var": 0.032942708333333334, "learning_rate": 0.0001, "loss": 5.4192, "loss/crossentropy": 2.188909649848938, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.294817179441452, "step": 1796 }, { "epoch": 0.03596, "grad_norm": 3.015625, "grad_norm_var": 0.030907185872395833, "learning_rate": 0.0001, "loss": 5.6303, "loss/crossentropy": 2.4745373725891113, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.32838208973407745, "step": 1798 }, { "epoch": 0.036, "grad_norm": 2.890625, "grad_norm_var": 0.028595987955729166, "learning_rate": 0.0001, "loss": 5.3466, "loss/crossentropy": 1.9314215779304504, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.29102426767349243, "step": 1800 }, { "epoch": 0.03604, "grad_norm": 2.796875, "grad_norm_var": 0.026276652018229166, "learning_rate": 0.0001, "loss": 4.9782, "loss/crossentropy": 2.0099900364875793, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.30170293152332306, "step": 1802 }, { "epoch": 0.03608, "grad_norm": 2.859375, "grad_norm_var": 0.024388631184895832, "learning_rate": 0.0001, "loss": 5.2506, "loss/crossentropy": 2.1273564100265503, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3019937574863434, "step": 1804 }, { "epoch": 0.03612, "grad_norm": 2.96875, "grad_norm_var": 0.016341145833333334, "learning_rate": 0.0001, "loss": 5.1003, "loss/crossentropy": 2.065160095691681, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2817380279302597, "step": 1806 }, { "epoch": 0.03616, "grad_norm": 2.828125, "grad_norm_var": 0.018561808268229167, "learning_rate": 0.0001, "loss": 5.1569, "loss/crossentropy": 2.262821078300476, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.33245067298412323, "step": 1808 }, { "epoch": 0.0362, "grad_norm": 2.625, "grad_norm_var": 0.020003255208333334, "learning_rate": 0.0001, "loss": 4.8323, "loss/crossentropy": 2.164163827896118, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26008155941963196, "step": 1810 }, { "epoch": 0.03624, "grad_norm": 2.78125, "grad_norm_var": 0.012398274739583333, "learning_rate": 0.0001, "loss": 5.2517, "loss/crossentropy": 2.147629737854004, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3073730617761612, "step": 1812 }, { "epoch": 0.03628, "grad_norm": 2.75, "grad_norm_var": 0.023900349934895832, "learning_rate": 0.0001, "loss": 5.1255, "loss/crossentropy": 2.04233980178833, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2981158718466759, "step": 1814 }, { "epoch": 0.03632, "grad_norm": 3.0, "grad_norm_var": 0.026439412434895834, "learning_rate": 0.0001, "loss": 4.9465, "loss/crossentropy": 2.264409065246582, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.32124973833560944, "step": 1816 }, { "epoch": 0.03636, "grad_norm": 3.109375, "grad_norm_var": 0.033610026041666664, "learning_rate": 0.0001, "loss": 5.4324, "loss/crossentropy": 2.092079997062683, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3318764269351959, "step": 1818 }, { "epoch": 0.0364, "grad_norm": 2.828125, "grad_norm_var": 0.03850504557291667, "learning_rate": 0.0001, "loss": 5.2766, "loss/crossentropy": 2.1007314324378967, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2930498272180557, "step": 1820 }, { "epoch": 0.03644, "grad_norm": 2.84375, "grad_norm_var": 0.03697001139322917, "learning_rate": 0.0001, "loss": 5.129, "loss/crossentropy": 2.2377375960350037, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.30473431944847107, "step": 1822 }, { "epoch": 0.03648, "grad_norm": 3.296875, "grad_norm_var": 0.04684244791666667, "learning_rate": 0.0001, "loss": 5.4209, "loss/crossentropy": 2.0965787172317505, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.30526305735111237, "step": 1824 }, { "epoch": 0.03652, "grad_norm": 3.078125, "grad_norm_var": 0.04755859375, "learning_rate": 0.0001, "loss": 5.0359, "loss/crossentropy": 2.208711862564087, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2727830111980438, "step": 1826 }, { "epoch": 0.03656, "grad_norm": 2.796875, "grad_norm_var": 0.050633748372395836, "learning_rate": 0.0001, "loss": 5.1692, "loss/crossentropy": 2.1706738471984863, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3054092824459076, "step": 1828 }, { "epoch": 0.0366, "grad_norm": 2.796875, "grad_norm_var": 0.037262980143229166, "learning_rate": 0.0001, "loss": 5.2311, "loss/crossentropy": 2.0891621112823486, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3111531287431717, "step": 1830 }, { "epoch": 0.03664, "grad_norm": 2.859375, "grad_norm_var": 0.03560791015625, "learning_rate": 0.0001, "loss": 5.0537, "loss/crossentropy": 2.0721256732940674, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.30227208137512207, "step": 1832 }, { "epoch": 0.03668, "grad_norm": 3.015625, "grad_norm_var": 0.03023681640625, "learning_rate": 0.0001, "loss": 5.1267, "loss/crossentropy": 2.2015734910964966, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3185647875070572, "step": 1834 }, { "epoch": 0.03672, "grad_norm": 3.03125, "grad_norm_var": 0.026493326822916666, "learning_rate": 0.0001, "loss": 5.1973, "loss/crossentropy": 2.1985132694244385, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3032165467739105, "step": 1836 }, { "epoch": 0.03676, "grad_norm": 2.890625, "grad_norm_var": 0.026334635416666665, "learning_rate": 0.0001, "loss": 5.2041, "loss/crossentropy": 2.170067548751831, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3295029550790787, "step": 1838 }, { "epoch": 0.0368, "grad_norm": 2.921875, "grad_norm_var": 0.014615885416666667, "learning_rate": 0.0001, "loss": 5.0049, "loss/crossentropy": 2.113592267036438, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2775915116071701, "step": 1840 }, { "epoch": 0.03684, "grad_norm": 2.78125, "grad_norm_var": 0.0117828369140625, "learning_rate": 0.0001, "loss": 5.4221, "loss/crossentropy": 2.1905024647712708, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.33297547698020935, "step": 1842 }, { "epoch": 0.03688, "grad_norm": 2.828125, "grad_norm_var": 0.012919108072916666, "learning_rate": 0.0001, "loss": 5.1859, "loss/crossentropy": 2.252369999885559, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.29375749826431274, "step": 1844 }, { "epoch": 0.03692, "grad_norm": 3.015625, "grad_norm_var": 0.020182291666666668, "learning_rate": 0.0001, "loss": 4.8942, "loss/crossentropy": 1.7526759505271912, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2707225978374481, "step": 1846 }, { "epoch": 0.03696, "grad_norm": 3.21875, "grad_norm_var": 0.028880818684895834, "learning_rate": 0.0001, "loss": 5.6529, "loss/crossentropy": 2.592544913291931, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3520146906375885, "step": 1848 }, { "epoch": 0.037, "grad_norm": 2.8125, "grad_norm_var": 0.0278961181640625, "learning_rate": 0.0001, "loss": 4.9816, "loss/crossentropy": 1.8699345588684082, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2852860391139984, "step": 1850 }, { "epoch": 0.03704, "grad_norm": 2.640625, "grad_norm_var": 0.03209228515625, "learning_rate": 0.0001, "loss": 5.1326, "loss/crossentropy": 2.2219313383102417, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3210798054933548, "step": 1852 }, { "epoch": 0.03708, "grad_norm": 2.796875, "grad_norm_var": 0.0327056884765625, "learning_rate": 0.0001, "loss": 5.2301, "loss/crossentropy": 1.9926818013191223, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26928839832544327, "step": 1854 }, { "epoch": 0.03712, "grad_norm": 2.828125, "grad_norm_var": 0.032515462239583334, "learning_rate": 0.0001, "loss": 5.0372, "loss/crossentropy": 2.019917130470276, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.28046920895576477, "step": 1856 }, { "epoch": 0.03716, "grad_norm": 4.84375, "grad_norm_var": 0.2762858072916667, "learning_rate": 0.0001, "loss": 5.6297, "loss/crossentropy": 2.2585690021514893, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.30598941445350647, "step": 1858 }, { "epoch": 0.0372, "grad_norm": 2.984375, "grad_norm_var": 0.27327067057291665, "learning_rate": 0.0001, "loss": 5.3508, "loss/crossentropy": 2.298324942588806, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3494870364665985, "step": 1860 }, { "epoch": 0.03724, "grad_norm": 3.125, "grad_norm_var": 0.25745035807291666, "learning_rate": 0.0001, "loss": 5.54, "loss/crossentropy": 2.430496573448181, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3227449208498001, "step": 1862 }, { "epoch": 0.03728, "grad_norm": 2.953125, "grad_norm_var": 0.26183980305989585, "learning_rate": 0.0001, "loss": 5.1876, "loss/crossentropy": 2.090576171875, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2790074646472931, "step": 1864 }, { "epoch": 0.03732, "grad_norm": 3.0625, "grad_norm_var": 0.25806884765625, "learning_rate": 0.0001, "loss": 5.1799, "loss/crossentropy": 2.2794109582901, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3135898858308792, "step": 1866 }, { "epoch": 0.03736, "grad_norm": 2.640625, "grad_norm_var": 0.2597076416015625, "learning_rate": 0.0001, "loss": 4.9333, "loss/crossentropy": 2.2433481216430664, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.27558377385139465, "step": 1868 }, { "epoch": 0.0374, "grad_norm": 2.59375, "grad_norm_var": 0.26806233723958334, "learning_rate": 0.0001, "loss": 5.292, "loss/crossentropy": 2.3111730813980103, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3172578364610672, "step": 1870 }, { "epoch": 0.03744, "grad_norm": 2.921875, "grad_norm_var": 0.270654296875, "learning_rate": 0.0001, "loss": 5.2364, "loss/crossentropy": 2.0028095841407776, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.35644619166851044, "step": 1872 }, { "epoch": 0.03748, "grad_norm": 2.828125, "grad_norm_var": 0.04501546223958333, "learning_rate": 0.0001, "loss": 5.2183, "loss/crossentropy": 2.347644329071045, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3119680881500244, "step": 1874 }, { "epoch": 0.03752, "grad_norm": 2.65625, "grad_norm_var": 0.03474833170572917, "learning_rate": 0.0001, "loss": 5.0787, "loss/crossentropy": 2.118954062461853, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.28975334763526917, "step": 1876 }, { "epoch": 0.03756, "grad_norm": 3.0625, "grad_norm_var": 0.033568318684895834, "learning_rate": 0.0001, "loss": 5.6649, "loss/crossentropy": 2.4376548528671265, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3358190506696701, "step": 1878 }, { "epoch": 0.0376, "grad_norm": 3.28125, "grad_norm_var": 0.042724609375, "learning_rate": 0.0001, "loss": 5.4924, "loss/crossentropy": 2.5907636880874634, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3234570771455765, "step": 1880 }, { "epoch": 0.03764, "grad_norm": 2.859375, "grad_norm_var": 0.04352925618489583, "learning_rate": 0.0001, "loss": 5.0298, "loss/crossentropy": 1.574956238269806, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26118964701890945, "step": 1882 }, { "epoch": 0.03768, "grad_norm": 3.203125, "grad_norm_var": 0.04599609375, "learning_rate": 0.0001, "loss": 5.3339, "loss/crossentropy": 2.3571736812591553, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.308853879570961, "step": 1884 }, { "epoch": 0.03772, "grad_norm": 2.546875, "grad_norm_var": 0.03877665201822917, "learning_rate": 0.0001, "loss": 5.0807, "loss/crossentropy": 2.1266958117485046, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.307782918214798, "step": 1886 }, { "epoch": 0.03776, "grad_norm": 3.0, "grad_norm_var": 0.03876546223958333, "learning_rate": 0.0001, "loss": 5.5402, "loss/crossentropy": 2.3529077768325806, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.32287272810935974, "step": 1888 }, { "epoch": 0.0378, "grad_norm": 3.015625, "grad_norm_var": 0.03905843098958333, "learning_rate": 0.0001, "loss": 5.4555, "loss/crossentropy": 2.277345299720764, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.32319171726703644, "step": 1890 }, { "epoch": 0.03784, "grad_norm": 2.859375, "grad_norm_var": 0.035008748372395836, "learning_rate": 0.0001, "loss": 5.01, "loss/crossentropy": 2.102940857410431, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.292771652340889, "step": 1892 }, { "epoch": 0.03788, "grad_norm": 2.640625, "grad_norm_var": 0.03998921712239583, "learning_rate": 0.0001, "loss": 5.0784, "loss/crossentropy": 1.9744033813476562, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2844446450471878, "step": 1894 }, { "epoch": 0.03792, "grad_norm": 2.625, "grad_norm_var": 0.03250325520833333, "learning_rate": 0.0001, "loss": 4.9859, "loss/crossentropy": 1.8222022652626038, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.28611525893211365, "step": 1896 }, { "epoch": 0.03796, "grad_norm": 2.96875, "grad_norm_var": 0.03135477701822917, "learning_rate": 0.0001, "loss": 5.0704, "loss/crossentropy": 2.1963966488838196, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2988738566637039, "step": 1898 }, { "epoch": 0.038, "grad_norm": 3.046875, "grad_norm_var": 0.025927734375, "learning_rate": 0.0001, "loss": 5.1155, "loss/crossentropy": 1.9982789754867554, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.312205046415329, "step": 1900 }, { "epoch": 0.03804, "grad_norm": 2.75, "grad_norm_var": 0.020612589518229165, "learning_rate": 0.0001, "loss": 5.2097, "loss/crossentropy": 2.1999258995056152, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.29916079342365265, "step": 1902 }, { "epoch": 0.03808, "grad_norm": 2.859375, "grad_norm_var": 0.016999308268229166, "learning_rate": 0.0001, "loss": 5.2233, "loss/crossentropy": 2.0725532174110413, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3093830645084381, "step": 1904 }, { "epoch": 0.03812, "grad_norm": 3.03125, "grad_norm_var": 0.027079264322916668, "learning_rate": 0.0001, "loss": 5.7014, "loss/crossentropy": 2.2504276037216187, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.332836389541626, "step": 1906 }, { "epoch": 0.03816, "grad_norm": 2.875, "grad_norm_var": 0.026927693684895834, "learning_rate": 0.0001, "loss": 5.3413, "loss/crossentropy": 2.1570577025413513, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3210139572620392, "step": 1908 }, { "epoch": 0.0382, "grad_norm": 3.59375, "grad_norm_var": 0.05137430826822917, "learning_rate": 0.0001, "loss": 5.6215, "loss/crossentropy": 2.0739041566848755, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3091907352209091, "step": 1910 }, { "epoch": 0.03824, "grad_norm": 2.703125, "grad_norm_var": 0.0502838134765625, "learning_rate": 0.0001, "loss": 5.0772, "loss/crossentropy": 2.0542168021202087, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2791624963283539, "step": 1912 }, { "epoch": 0.03828, "grad_norm": 2.65625, "grad_norm_var": 0.0561431884765625, "learning_rate": 0.0001, "loss": 4.9386, "loss/crossentropy": 1.9705287218093872, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25483617186546326, "step": 1914 }, { "epoch": 0.03832, "grad_norm": 2.875, "grad_norm_var": 0.05819905598958333, "learning_rate": 0.0001, "loss": 5.054, "loss/crossentropy": 2.0234111547470093, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3110152333974838, "step": 1916 }, { "epoch": 0.03836, "grad_norm": 3.15625, "grad_norm_var": 0.06004130045572917, "learning_rate": 0.0001, "loss": 5.2723, "loss/crossentropy": 2.010735273361206, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2943515181541443, "step": 1918 }, { "epoch": 0.0384, "grad_norm": 2.734375, "grad_norm_var": 0.06575419108072916, "learning_rate": 0.0001, "loss": 4.9471, "loss/crossentropy": 2.1912686824798584, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.27914971113204956, "step": 1920 }, { "epoch": 0.03844, "grad_norm": 2.421875, "grad_norm_var": 0.0709869384765625, "learning_rate": 0.0001, "loss": 5.085, "loss/crossentropy": 1.9889940023422241, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26166442036628723, "step": 1922 }, { "epoch": 0.03848, "grad_norm": 2.96875, "grad_norm_var": 0.07668863932291667, "learning_rate": 0.0001, "loss": 5.3534, "loss/crossentropy": 2.154898941516876, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.31905338168144226, "step": 1924 }, { "epoch": 0.03852, "grad_norm": 2.796875, "grad_norm_var": 0.03997395833333333, "learning_rate": 0.0001, "loss": 5.0847, "loss/crossentropy": 2.44269061088562, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.28736811876296997, "step": 1926 }, { "epoch": 0.03856, "grad_norm": 2.6875, "grad_norm_var": 0.04108784993489583, "learning_rate": 0.0001, "loss": 5.0585, "loss/crossentropy": 1.6790328621864319, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.32607489824295044, "step": 1928 }, { "epoch": 0.0386, "grad_norm": 3.046875, "grad_norm_var": 0.05115559895833333, "learning_rate": 0.0001, "loss": 5.336, "loss/crossentropy": 2.0223641991615295, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2871186435222626, "step": 1930 }, { "epoch": 0.03864, "grad_norm": 2.8125, "grad_norm_var": 0.05756734212239583, "learning_rate": 0.0001, "loss": 5.549, "loss/crossentropy": 2.451051712036133, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.33973076939582825, "step": 1932 }, { "epoch": 0.03868, "grad_norm": 2.765625, "grad_norm_var": 0.052262369791666666, "learning_rate": 0.0001, "loss": 5.4403, "loss/crossentropy": 2.2884862422943115, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.29729554057121277, "step": 1934 }, { "epoch": 0.03872, "grad_norm": 2.890625, "grad_norm_var": 0.04889322916666667, "learning_rate": 0.0001, "loss": 5.6203, "loss/crossentropy": 2.113345444202423, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.36945630609989166, "step": 1936 }, { "epoch": 0.03876, "grad_norm": 2.71875, "grad_norm_var": 0.03752848307291667, "learning_rate": 0.0001, "loss": 5.1845, "loss/crossentropy": 2.139409840106964, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2899101823568344, "step": 1938 }, { "epoch": 0.0388, "grad_norm": 2.953125, "grad_norm_var": 0.03622639973958333, "learning_rate": 0.0001, "loss": 5.5397, "loss/crossentropy": 2.1029305458068848, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.29830390214920044, "step": 1940 }, { "epoch": 0.03884, "grad_norm": 2.734375, "grad_norm_var": 0.03816630045572917, "learning_rate": 0.0001, "loss": 5.1383, "loss/crossentropy": 1.7736502885818481, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.27648696303367615, "step": 1942 }, { "epoch": 0.03888, "grad_norm": 2.671875, "grad_norm_var": 0.03942057291666667, "learning_rate": 0.0001, "loss": 5.0885, "loss/crossentropy": 2.0281469225883484, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.30329641699790955, "step": 1944 }, { "epoch": 0.03892, "grad_norm": 2.890625, "grad_norm_var": 0.03528544108072917, "learning_rate": 0.0001, "loss": 5.3265, "loss/crossentropy": 2.404891610145569, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3563212752342224, "step": 1946 }, { "epoch": 0.03896, "grad_norm": 2.53125, "grad_norm_var": 0.03572489420572917, "learning_rate": 0.0001, "loss": 5.0657, "loss/crossentropy": 2.2187922596931458, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2999056503176689, "step": 1948 }, { "epoch": 0.039, "grad_norm": 2.875, "grad_norm_var": 0.03566792805989583, "learning_rate": 0.0001, "loss": 5.0499, "loss/crossentropy": 2.3901994228363037, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3288661539554596, "step": 1950 }, { "epoch": 0.03904, "grad_norm": 3.15625, "grad_norm_var": 0.04069722493489583, "learning_rate": 0.0001, "loss": 5.252, "loss/crossentropy": 2.274617314338684, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2928028404712677, "step": 1952 }, { "epoch": 0.03908, "grad_norm": 2.859375, "grad_norm_var": 0.03430582682291667, "learning_rate": 0.0001, "loss": 5.4463, "loss/crossentropy": 2.2478950023651123, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3027127981185913, "step": 1954 }, { "epoch": 0.03912, "grad_norm": 2.78125, "grad_norm_var": 0.030321248372395835, "learning_rate": 0.0001, "loss": 5.2707, "loss/crossentropy": 2.0634876489639282, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.29374830424785614, "step": 1956 }, { "epoch": 0.03916, "grad_norm": 2.90625, "grad_norm_var": 0.0287994384765625, "learning_rate": 0.0001, "loss": 5.3441, "loss/crossentropy": 2.171326994895935, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2995557487010956, "step": 1958 }, { "epoch": 0.0392, "grad_norm": 2.515625, "grad_norm_var": 0.040266927083333334, "learning_rate": 0.0001, "loss": 4.9937, "loss/crossentropy": 2.142563223838806, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.27376608550548553, "step": 1960 }, { "epoch": 0.03924, "grad_norm": 5.34375, "grad_norm_var": 0.42568359375, "learning_rate": 0.0001, "loss": 5.4145, "loss/crossentropy": 2.3189245462417603, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.37109237909317017, "step": 1962 }, { "epoch": 0.03928, "grad_norm": 3.203125, "grad_norm_var": 0.40812886555989586, "learning_rate": 0.0001, "loss": 5.1826, "loss/crossentropy": 2.1483139991760254, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.32499393820762634, "step": 1964 }, { "epoch": 0.03932, "grad_norm": 3.046875, "grad_norm_var": 0.404736328125, "learning_rate": 0.0001, "loss": 5.445, "loss/crossentropy": 2.2916383743286133, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.29367291927337646, "step": 1966 }, { "epoch": 0.03936, "grad_norm": 2.9375, "grad_norm_var": 0.3999582926432292, "learning_rate": 0.0001, "loss": 5.1663, "loss/crossentropy": 2.4217371940612793, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.31309331953525543, "step": 1968 }, { "epoch": 0.0394, "grad_norm": 2.9375, "grad_norm_var": 0.40103759765625, "learning_rate": 0.0001, "loss": 5.2057, "loss/crossentropy": 1.9491975903511047, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.29921913146972656, "step": 1970 }, { "epoch": 0.03944, "grad_norm": 2.78125, "grad_norm_var": 0.39126688639322915, "learning_rate": 0.0001, "loss": 5.1788, "loss/crossentropy": 2.144432306289673, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.29162150621414185, "step": 1972 }, { "epoch": 0.03948, "grad_norm": 2.765625, "grad_norm_var": 0.40748291015625, "learning_rate": 0.0001, "loss": 5.1336, "loss/crossentropy": 1.9492529034614563, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2875414192676544, "step": 1974 }, { "epoch": 0.03952, "grad_norm": 2.75, "grad_norm_var": 0.4002593994140625, "learning_rate": 0.0001, "loss": 5.053, "loss/crossentropy": 1.9269813895225525, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2755381464958191, "step": 1976 }, { "epoch": 0.03956, "grad_norm": 2.828125, "grad_norm_var": 0.03798726399739583, "learning_rate": 0.0001, "loss": 4.8879, "loss/crossentropy": 2.074360489845276, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2987503558397293, "step": 1978 }, { "epoch": 0.0396, "grad_norm": 2.953125, "grad_norm_var": 0.02086181640625, "learning_rate": 0.0001, "loss": 4.8834, "loss/crossentropy": 2.257633090019226, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2992274910211563, "step": 1980 }, { "epoch": 0.03964, "grad_norm": 2.75, "grad_norm_var": 0.01802978515625, "learning_rate": 0.0001, "loss": 4.9533, "loss/crossentropy": 1.8207083940505981, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2752758115530014, "step": 1982 }, { "epoch": 0.03968, "grad_norm": 2.875, "grad_norm_var": 0.015623982747395833, "learning_rate": 0.0001, "loss": 5.343, "loss/crossentropy": 2.105292797088623, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3131226450204849, "step": 1984 }, { "epoch": 0.03972, "grad_norm": 3.484375, "grad_norm_var": 0.05191650390625, "learning_rate": 0.0001, "loss": 5.4785, "loss/crossentropy": 2.1191373467445374, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3175530731678009, "step": 1986 }, { "epoch": 0.03976, "grad_norm": 2.75, "grad_norm_var": 0.051878865559895834, "learning_rate": 0.0001, "loss": 4.9236, "loss/crossentropy": 2.2214397192001343, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3091724067926407, "step": 1988 }, { "epoch": 0.0398, "grad_norm": 2.875, "grad_norm_var": 0.05133056640625, "learning_rate": 0.0001, "loss": 5.0031, "loss/crossentropy": 1.7347424626350403, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2825440764427185, "step": 1990 }, { "epoch": 0.03984, "grad_norm": 2.890625, "grad_norm_var": 0.0500396728515625, "learning_rate": 0.0001, "loss": 5.0951, "loss/crossentropy": 2.1566559076309204, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2961925268173218, "step": 1992 }, { "epoch": 0.03988, "grad_norm": 2.703125, "grad_norm_var": 0.06396077473958334, "learning_rate": 0.0001, "loss": 4.9195, "loss/crossentropy": 2.2129205465316772, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.29672613739967346, "step": 1994 }, { "epoch": 0.03992, "grad_norm": 3.3125, "grad_norm_var": 0.07595113118489584, "learning_rate": 0.0001, "loss": 5.7534, "loss/crossentropy": 2.4702744483947754, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3707122802734375, "step": 1996 }, { "epoch": 0.03996, "grad_norm": 2.84375, "grad_norm_var": 0.06731669108072917, "learning_rate": 0.0001, "loss": 5.0296, "loss/crossentropy": 2.0463536977767944, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3124735355377197, "step": 1998 }, { "epoch": 0.04, "grad_norm": 2.796875, "grad_norm_var": 0.07281494140625, "learning_rate": 0.0001, "loss": 4.959, "loss/crossentropy": 2.1550235748291016, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.32177163660526276, "step": 2000 }, { "epoch": 0.04004, "grad_norm": 2.71875, "grad_norm_var": 0.059798177083333334, "learning_rate": 0.0001, "loss": 5.2078, "loss/crossentropy": 2.1312190890312195, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.29013920575380325, "step": 2002 }, { "epoch": 0.04008, "grad_norm": 2.75, "grad_norm_var": 0.06082356770833333, "learning_rate": 0.0001, "loss": 5.0086, "loss/crossentropy": 1.8546085357666016, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2619543671607971, "step": 2004 }, { "epoch": 0.04012, "grad_norm": 3.015625, "grad_norm_var": 0.0561920166015625, "learning_rate": 0.0001, "loss": 5.3416, "loss/crossentropy": 2.262398660182953, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.28742220997810364, "step": 2006 }, { "epoch": 0.04016, "grad_norm": 3.109375, "grad_norm_var": 0.05734049479166667, "learning_rate": 0.0001, "loss": 5.4315, "loss/crossentropy": 2.156043767929077, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.30488699674606323, "step": 2008 }, { "epoch": 0.0402, "grad_norm": 2.765625, "grad_norm_var": 0.04290364583333333, "learning_rate": 0.0001, "loss": 4.8761, "loss/crossentropy": 1.925516963005066, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2795708477497101, "step": 2010 }, { "epoch": 0.04024, "grad_norm": 2.6875, "grad_norm_var": 0.031281534830729166, "learning_rate": 0.0001, "loss": 5.0729, "loss/crossentropy": 1.947714388370514, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.27192793786525726, "step": 2012 }, { "epoch": 0.04028, "grad_norm": 2.625, "grad_norm_var": 0.038939412434895834, "learning_rate": 0.0001, "loss": 4.6214, "loss/crossentropy": 1.9584010243415833, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2835424840450287, "step": 2014 }, { "epoch": 0.04032, "grad_norm": 2.578125, "grad_norm_var": 0.04108784993489583, "learning_rate": 0.0001, "loss": 5.1974, "loss/crossentropy": 2.461808919906616, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3104698956012726, "step": 2016 }, { "epoch": 0.04036, "grad_norm": 2.734375, "grad_norm_var": 0.021903483072916667, "learning_rate": 0.0001, "loss": 5.286, "loss/crossentropy": 2.094545900821686, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.30010756850242615, "step": 2018 }, { "epoch": 0.0404, "grad_norm": 2.78125, "grad_norm_var": 0.021923828125, "learning_rate": 0.0001, "loss": 5.1274, "loss/crossentropy": 2.353589177131653, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.29775144159793854, "step": 2020 }, { "epoch": 0.04044, "grad_norm": 3.375, "grad_norm_var": 10.55537821451823, "learning_rate": 0.0001, "loss": 5.3359, "loss/crossentropy": 2.4468252658843994, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.34534354507923126, "step": 2022 }, { "epoch": 0.04048, "grad_norm": 2.984375, "grad_norm_var": 10.529678344726562, "learning_rate": 0.0001, "loss": 5.5028, "loss/crossentropy": 2.2037755250930786, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.303710475564003, "step": 2024 }, { "epoch": 0.04052, "grad_norm": 2.71875, "grad_norm_var": 10.546240234375, "learning_rate": 0.0001, "loss": 4.9229, "loss/crossentropy": 1.9658318161964417, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2829178273677826, "step": 2026 }, { "epoch": 0.04056, "grad_norm": 2.734375, "grad_norm_var": 10.555729166666667, "learning_rate": 0.0001, "loss": 4.8996, "loss/crossentropy": 2.118351697921753, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.29469528794288635, "step": 2028 }, { "epoch": 0.0406, "grad_norm": 2.875, "grad_norm_var": 10.507957967122396, "learning_rate": 0.0001, "loss": 5.5817, "loss/crossentropy": 2.172826111316681, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.35036201775074005, "step": 2030 }, { "epoch": 0.04064, "grad_norm": 3.953125, "grad_norm_var": 10.432124837239583, "learning_rate": 0.0001, "loss": 5.4846, "loss/crossentropy": 2.185975730419159, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3397497236728668, "step": 2032 }, { "epoch": 0.04068, "grad_norm": 2.78125, "grad_norm_var": 10.411026000976562, "learning_rate": 0.0001, "loss": 5.0145, "loss/crossentropy": 2.043874442577362, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3034070134162903, "step": 2034 }, { "epoch": 0.04072, "grad_norm": 2.984375, "grad_norm_var": 10.380106608072916, "learning_rate": 0.0001, "loss": 5.3958, "loss/crossentropy": 2.3315287828445435, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.32198067009449005, "step": 2036 }, { "epoch": 0.04076, "grad_norm": 2.828125, "grad_norm_var": 0.1112213134765625, "learning_rate": 0.0001, "loss": 5.0266, "loss/crossentropy": 2.05656898021698, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.30919137597084045, "step": 2038 }, { "epoch": 0.0408, "grad_norm": 2.578125, "grad_norm_var": 0.10188395182291667, "learning_rate": 0.0001, "loss": 5.3205, "loss/crossentropy": 2.2451635599136353, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3412973880767822, "step": 2040 }, { "epoch": 0.04084, "grad_norm": 3.1875, "grad_norm_var": 0.10715738932291667, "learning_rate": 0.0001, "loss": 5.1734, "loss/crossentropy": 2.527924060821533, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3178148865699768, "step": 2042 }, { "epoch": 0.04088, "grad_norm": 2.78125, "grad_norm_var": 0.10305582682291667, "learning_rate": 0.0001, "loss": 4.8441, "loss/crossentropy": 2.03126460313797, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.29549214243888855, "step": 2044 }, { "epoch": 0.04092, "grad_norm": 2.515625, "grad_norm_var": 0.11038004557291667, "learning_rate": 0.0001, "loss": 5.0134, "loss/crossentropy": 2.029997706413269, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.27938202023506165, "step": 2046 }, { "epoch": 0.04096, "grad_norm": 2.703125, "grad_norm_var": 0.03211263020833333, "learning_rate": 0.0001, "loss": 4.9321, "loss/crossentropy": 1.764098048210144, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.27712512016296387, "step": 2048 }, { "epoch": 0.041, "grad_norm": 3.046875, "grad_norm_var": 0.0356597900390625, "learning_rate": 0.0001, "loss": 5.435, "loss/crossentropy": 2.605324864387512, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3540754020214081, "step": 2050 }, { "epoch": 0.04104, "grad_norm": 2.875, "grad_norm_var": 0.0349761962890625, "learning_rate": 0.0001, "loss": 5.0207, "loss/crossentropy": 1.9333613514900208, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.29670488089323044, "step": 2052 }, { "epoch": 0.04108, "grad_norm": 3.078125, "grad_norm_var": 0.040022786458333334, "learning_rate": 0.0001, "loss": 5.0056, "loss/crossentropy": 1.7876797914505005, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2579014301300049, "step": 2054 }, { "epoch": 0.04112, "grad_norm": 3.125, "grad_norm_var": 0.0417877197265625, "learning_rate": 0.0001, "loss": 5.1401, "loss/crossentropy": 2.0947588682174683, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3334304690361023, "step": 2056 }, { "epoch": 0.04116, "grad_norm": 3.0, "grad_norm_var": 0.035965983072916666, "learning_rate": 0.0001, "loss": 4.9009, "loss/crossentropy": 2.1838767528533936, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.306907519698143, "step": 2058 }, { "epoch": 0.0412, "grad_norm": 3.015625, "grad_norm_var": 0.0341461181640625, "learning_rate": 0.0001, "loss": 5.3724, "loss/crossentropy": 2.2180997133255005, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.314627081155777, "step": 2060 }, { "epoch": 0.04124, "grad_norm": 2.875, "grad_norm_var": 0.025419108072916665, "learning_rate": 0.0001, "loss": 4.8362, "loss/crossentropy": 1.914646863937378, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2848198413848877, "step": 2062 }, { "epoch": 0.04128, "grad_norm": 2.609375, "grad_norm_var": 0.0262603759765625, "learning_rate": 0.0001, "loss": 5.4538, "loss/crossentropy": 2.42458713054657, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.31268230080604553, "step": 2064 }, { "epoch": 0.04132, "grad_norm": 2.828125, "grad_norm_var": 0.02457275390625, "learning_rate": 0.0001, "loss": 5.2497, "loss/crossentropy": 2.23202121257782, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.30988384783267975, "step": 2066 }, { "epoch": 0.04136, "grad_norm": 2.5625, "grad_norm_var": 0.0284820556640625, "learning_rate": 0.0001, "loss": 5.0416, "loss/crossentropy": 2.0225483179092407, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26953594386577606, "step": 2068 }, { "epoch": 0.0414, "grad_norm": 2.875, "grad_norm_var": 0.028669230143229165, "learning_rate": 0.0001, "loss": 4.8259, "loss/crossentropy": 1.8593338131904602, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.27706706523895264, "step": 2070 }, { "epoch": 0.04144, "grad_norm": 2.59375, "grad_norm_var": 0.023387654622395834, "learning_rate": 0.0001, "loss": 4.9949, "loss/crossentropy": 2.373727560043335, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3000074476003647, "step": 2072 }, { "epoch": 0.04148, "grad_norm": 2.578125, "grad_norm_var": 0.022704060872395834, "learning_rate": 0.0001, "loss": 4.9438, "loss/crossentropy": 1.959564983844757, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26365046203136444, "step": 2074 }, { "epoch": 0.04152, "grad_norm": 2.921875, "grad_norm_var": 0.014159138997395833, "learning_rate": 0.0001, "loss": 4.983, "loss/crossentropy": 2.0590676069259644, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2816864550113678, "step": 2076 }, { "epoch": 0.04156, "grad_norm": 2.84375, "grad_norm_var": 0.017121378580729166, "learning_rate": 0.0001, "loss": 5.2049, "loss/crossentropy": 2.147680163383484, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.29019051790237427, "step": 2078 }, { "epoch": 0.0416, "grad_norm": 2.609375, "grad_norm_var": 0.020198567708333334, "learning_rate": 0.0001, "loss": 5.6103, "loss/crossentropy": 2.212267220020294, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2804659754037857, "step": 2080 }, { "epoch": 0.04164, "grad_norm": 2.65625, "grad_norm_var": 0.020406087239583332, "learning_rate": 0.0001, "loss": 5.0489, "loss/crossentropy": 2.144743025302887, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2946523129940033, "step": 2082 }, { "epoch": 0.04168, "grad_norm": 2.703125, "grad_norm_var": 0.019172159830729167, "learning_rate": 0.0001, "loss": 4.9311, "loss/crossentropy": 2.3702481985092163, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2923210561275482, "step": 2084 }, { "epoch": 0.04172, "grad_norm": 2.96875, "grad_norm_var": 0.019684855143229166, "learning_rate": 0.0001, "loss": 5.2291, "loss/crossentropy": 1.9512975811958313, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.28704553842544556, "step": 2086 }, { "epoch": 0.04176, "grad_norm": 3.25, "grad_norm_var": 0.25388895670572914, "learning_rate": 0.0001, "loss": 4.9572, "loss/crossentropy": 2.180745005607605, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.28361976146698, "step": 2088 }, { "epoch": 0.0418, "grad_norm": 2.671875, "grad_norm_var": 0.24712626139322916, "learning_rate": 0.0001, "loss": 4.8579, "loss/crossentropy": 1.9768275022506714, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.28042787313461304, "step": 2090 }, { "epoch": 0.04184, "grad_norm": 2.90625, "grad_norm_var": 0.24544270833333334, "learning_rate": 0.0001, "loss": 5.2602, "loss/crossentropy": 2.148472547531128, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.30099035799503326, "step": 2092 }, { "epoch": 0.04188, "grad_norm": 2.734375, "grad_norm_var": 0.24763081868489584, "learning_rate": 0.0001, "loss": 5.0152, "loss/crossentropy": 2.1698715686798096, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3140450567007065, "step": 2094 }, { "epoch": 0.04192, "grad_norm": 2.828125, "grad_norm_var": 0.24172770182291667, "learning_rate": 0.0001, "loss": 4.8679, "loss/crossentropy": 2.1142334938049316, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2847675681114197, "step": 2096 }, { "epoch": 0.04196, "grad_norm": 2.703125, "grad_norm_var": 0.2395660400390625, "learning_rate": 0.0001, "loss": 5.2185, "loss/crossentropy": 2.1908479928970337, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.28702451288700104, "step": 2098 }, { "epoch": 0.042, "grad_norm": 2.6875, "grad_norm_var": 0.23321024576822916, "learning_rate": 0.0001, "loss": 5.0212, "loss/crossentropy": 2.0519612431526184, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.29224735498428345, "step": 2100 }, { "epoch": 0.04204, "grad_norm": 2.96875, "grad_norm_var": 0.2412994384765625, "learning_rate": 0.0001, "loss": 4.871, "loss/crossentropy": 1.9304961562156677, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.28785137832164764, "step": 2102 }, { "epoch": 0.04208, "grad_norm": 2.765625, "grad_norm_var": 0.0134185791015625, "learning_rate": 0.0001, "loss": 5.2462, "loss/crossentropy": 2.297300934791565, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.302143856883049, "step": 2104 }, { "epoch": 0.04212, "grad_norm": 2.453125, "grad_norm_var": 0.019782511393229167, "learning_rate": 0.0001, "loss": 5.0491, "loss/crossentropy": 2.2764381170272827, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.28773219883441925, "step": 2106 }, { "epoch": 0.04216, "grad_norm": 2.625, "grad_norm_var": 0.019391886393229165, "learning_rate": 0.0001, "loss": 5.0563, "loss/crossentropy": 2.141321837902069, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3108212947845459, "step": 2108 }, { "epoch": 0.0422, "grad_norm": 2.71875, "grad_norm_var": 0.0185699462890625, "learning_rate": 0.0001, "loss": 5.0362, "loss/crossentropy": 1.9619495272636414, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2938811331987381, "step": 2110 }, { "epoch": 0.04224, "grad_norm": 3.015625, "grad_norm_var": 0.026325480143229166, "learning_rate": 0.0001, "loss": 5.4496, "loss/crossentropy": 1.9741051197052002, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.28835102915763855, "step": 2112 }, { "epoch": 0.04228, "grad_norm": 2.546875, "grad_norm_var": 0.029255167643229166, "learning_rate": 0.0001, "loss": 4.9303, "loss/crossentropy": 1.9510936737060547, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2839510589838028, "step": 2114 }, { "epoch": 0.04232, "grad_norm": 2.828125, "grad_norm_var": 0.026753743489583332, "learning_rate": 0.0001, "loss": 5.2446, "loss/crossentropy": 2.0201885104179382, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.30627067387104034, "step": 2116 }, { "epoch": 0.04236, "grad_norm": 2.6875, "grad_norm_var": 0.022493489583333335, "learning_rate": 0.0001, "loss": 5.1411, "loss/crossentropy": 2.4522262811660767, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.308579683303833, "step": 2118 }, { "epoch": 0.0424, "grad_norm": 3.09375, "grad_norm_var": 0.04345296223958333, "learning_rate": 0.0001, "loss": 5.5535, "loss/crossentropy": 1.9289590120315552, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.29346515238285065, "step": 2120 }, { "epoch": 0.04244, "grad_norm": 2.8125, "grad_norm_var": 0.03437398274739583, "learning_rate": 0.0001, "loss": 5.0588, "loss/crossentropy": 2.2020061016082764, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2992282509803772, "step": 2122 }, { "epoch": 0.04248, "grad_norm": 2.78125, "grad_norm_var": 0.031494140625, "learning_rate": 0.0001, "loss": 5.2466, "loss/crossentropy": 2.180301785469055, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.28323256969451904, "step": 2124 }, { "epoch": 0.04252, "grad_norm": 2.78125, "grad_norm_var": 0.03435872395833333, "learning_rate": 0.0001, "loss": 4.9061, "loss/crossentropy": 2.1250513792037964, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.272533118724823, "step": 2126 }, { "epoch": 0.04256, "grad_norm": 2.78125, "grad_norm_var": 0.037873331705729166, "learning_rate": 0.0001, "loss": 5.4375, "loss/crossentropy": 2.3509981632232666, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3719516545534134, "step": 2128 }, { "epoch": 0.0426, "grad_norm": 3.078125, "grad_norm_var": 0.03508707682291667, "learning_rate": 0.0001, "loss": 5.3067, "loss/crossentropy": 2.135426163673401, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3582882583141327, "step": 2130 }, { "epoch": 0.04264, "grad_norm": 2.578125, "grad_norm_var": 0.0398590087890625, "learning_rate": 0.0001, "loss": 5.2406, "loss/crossentropy": 2.316452383995056, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.30162203311920166, "step": 2132 }, { "epoch": 0.04268, "grad_norm": 2.765625, "grad_norm_var": 0.03846028645833333, "learning_rate": 0.0001, "loss": 5.0372, "loss/crossentropy": 2.0325432419776917, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.287256121635437, "step": 2134 }, { "epoch": 0.04272, "grad_norm": 2.6875, "grad_norm_var": 0.0236236572265625, "learning_rate": 0.0001, "loss": 5.1985, "loss/crossentropy": 2.070056974887848, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2552843391895294, "step": 2136 }, { "epoch": 0.04276, "grad_norm": 3.078125, "grad_norm_var": 0.029157511393229165, "learning_rate": 0.0001, "loss": 5.0623, "loss/crossentropy": 1.7005944848060608, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24984879791736603, "step": 2138 }, { "epoch": 0.0428, "grad_norm": 2.75, "grad_norm_var": 0.03203125, "learning_rate": 0.0001, "loss": 5.0862, "loss/crossentropy": 1.6700931787490845, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2572527676820755, "step": 2140 }, { "epoch": 0.04284, "grad_norm": 2.65625, "grad_norm_var": 0.03125, "learning_rate": 0.0001, "loss": 5.0186, "loss/crossentropy": 2.3074774742126465, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.31137382984161377, "step": 2142 }, { "epoch": 0.04288, "grad_norm": 2.84375, "grad_norm_var": 0.0229400634765625, "learning_rate": 0.0001, "loss": 5.1973, "loss/crossentropy": 2.103408098220825, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.30157215893268585, "step": 2144 }, { "epoch": 0.04292, "grad_norm": 2.796875, "grad_norm_var": 0.020979817708333334, "learning_rate": 0.0001, "loss": 4.8206, "loss/crossentropy": 1.8602584600448608, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.28300634026527405, "step": 2146 }, { "epoch": 0.04296, "grad_norm": 2.671875, "grad_norm_var": 0.019188435872395833, "learning_rate": 0.0001, "loss": 5.0525, "loss/crossentropy": 2.337582588195801, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.28086431324481964, "step": 2148 }, { "epoch": 0.043, "grad_norm": 2.796875, "grad_norm_var": 0.026439412434895834, "learning_rate": 0.0001, "loss": 5.2405, "loss/crossentropy": 2.2635254859924316, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3113311231136322, "step": 2150 }, { "epoch": 0.04304, "grad_norm": 2.96875, "grad_norm_var": 0.0277984619140625, "learning_rate": 0.0001, "loss": 5.187, "loss/crossentropy": 2.3971948623657227, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.32285284996032715, "step": 2152 }, { "epoch": 0.04308, "grad_norm": 2.875, "grad_norm_var": 0.021891276041666668, "learning_rate": 0.0001, "loss": 5.1438, "loss/crossentropy": 1.8900776505470276, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.28163351118564606, "step": 2154 }, { "epoch": 0.04312, "grad_norm": 2.765625, "grad_norm_var": 0.020536295572916665, "learning_rate": 0.0001, "loss": 4.9774, "loss/crossentropy": 1.908443808555603, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2901918590068817, "step": 2156 }, { "epoch": 0.04316, "grad_norm": 2.75, "grad_norm_var": 0.020409138997395833, "learning_rate": 0.0001, "loss": 4.7808, "loss/crossentropy": 1.8003268837928772, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2762569487094879, "step": 2158 }, { "epoch": 0.0432, "grad_norm": 2.640625, "grad_norm_var": 0.0221832275390625, "learning_rate": 0.0001, "loss": 5.0477, "loss/crossentropy": 1.996739387512207, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24626458436250687, "step": 2160 }, { "epoch": 0.04324, "grad_norm": 2.84375, "grad_norm_var": 0.018880208333333332, "learning_rate": 0.0001, "loss": 5.1737, "loss/crossentropy": 2.0175461173057556, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.31059183180332184, "step": 2162 }, { "epoch": 0.04328, "grad_norm": 2.6875, "grad_norm_var": 0.019266764322916668, "learning_rate": 0.0001, "loss": 5.0448, "loss/crossentropy": 2.0009909868240356, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2835536003112793, "step": 2164 }, { "epoch": 0.04332, "grad_norm": 2.65625, "grad_norm_var": 0.017967732747395833, "learning_rate": 0.0001, "loss": 4.9035, "loss/crossentropy": 1.9848785400390625, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26657119393348694, "step": 2166 }, { "epoch": 0.04336, "grad_norm": 2.734375, "grad_norm_var": 0.01207275390625, "learning_rate": 0.0001, "loss": 4.9108, "loss/crossentropy": 2.076065957546234, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2801993191242218, "step": 2168 }, { "epoch": 0.0434, "grad_norm": 2.828125, "grad_norm_var": 0.011617024739583334, "learning_rate": 0.0001, "loss": 5.1425, "loss/crossentropy": 2.1208528876304626, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3152369260787964, "step": 2170 }, { "epoch": 0.04344, "grad_norm": 2.625, "grad_norm_var": 0.011400349934895833, "learning_rate": 0.0001, "loss": 5.0608, "loss/crossentropy": 2.1971306204795837, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.31046128273010254, "step": 2172 }, { "epoch": 0.04348, "grad_norm": 2.734375, "grad_norm_var": 0.013158162434895834, "learning_rate": 0.0001, "loss": 5.2445, "loss/crossentropy": 2.275176525115967, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3090529441833496, "step": 2174 }, { "epoch": 0.04352, "grad_norm": 2.8125, "grad_norm_var": 0.01197509765625, "learning_rate": 0.0001, "loss": 4.9366, "loss/crossentropy": 2.1574501395225525, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.280165433883667, "step": 2176 }, { "epoch": 0.04356, "grad_norm": 2.65625, "grad_norm_var": 0.012548828125, "learning_rate": 0.0001, "loss": 5.2338, "loss/crossentropy": 2.4236754179000854, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3061629384756088, "step": 2178 }, { "epoch": 0.0436, "grad_norm": 2.78125, "grad_norm_var": 0.012809244791666667, "learning_rate": 0.0001, "loss": 5.1707, "loss/crossentropy": 2.1282758712768555, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3050261586904526, "step": 2180 }, { "epoch": 0.04364, "grad_norm": 2.828125, "grad_norm_var": 0.008561197916666667, "learning_rate": 0.0001, "loss": 5.4629, "loss/crossentropy": 2.4244707822799683, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.33464157581329346, "step": 2182 }, { "epoch": 0.04368, "grad_norm": 2.640625, "grad_norm_var": 0.010904947916666666, "learning_rate": 0.0001, "loss": 5.3891, "loss/crossentropy": 2.289917469024658, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3181813210248947, "step": 2184 }, { "epoch": 0.04372, "grad_norm": 2.890625, "grad_norm_var": 0.01031494140625, "learning_rate": 0.0001, "loss": 5.4207, "loss/crossentropy": 2.1540024280548096, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.298043891787529, "step": 2186 }, { "epoch": 0.04376, "grad_norm": 2.609375, "grad_norm_var": 0.0145660400390625, "learning_rate": 0.0001, "loss": 4.8595, "loss/crossentropy": 1.6615915298461914, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23485098034143448, "step": 2188 }, { "epoch": 0.0438, "grad_norm": 2.796875, "grad_norm_var": 0.01357421875, "learning_rate": 0.0001, "loss": 5.0595, "loss/crossentropy": 2.352560341358185, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2965056747198105, "step": 2190 }, { "epoch": 0.04384, "grad_norm": 2.625, "grad_norm_var": 0.0183746337890625, "learning_rate": 0.0001, "loss": 5.1463, "loss/crossentropy": 2.0864007472991943, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2859686613082886, "step": 2192 }, { "epoch": 0.04388, "grad_norm": 2.765625, "grad_norm_var": 0.016600545247395834, "learning_rate": 0.0001, "loss": 5.2599, "loss/crossentropy": 1.8934992551803589, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24994677305221558, "step": 2194 }, { "epoch": 0.04392, "grad_norm": 2.578125, "grad_norm_var": 0.02476806640625, "learning_rate": 0.0001, "loss": 4.9976, "loss/crossentropy": 2.2395824193954468, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2719078063964844, "step": 2196 }, { "epoch": 0.04396, "grad_norm": 2.84375, "grad_norm_var": 0.02515869140625, "learning_rate": 0.0001, "loss": 5.2631, "loss/crossentropy": 2.089230954647064, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2759791761636734, "step": 2198 }, { "epoch": 0.044, "grad_norm": 2.75, "grad_norm_var": 0.023567708333333333, "learning_rate": 0.0001, "loss": 5.298, "loss/crossentropy": 2.2770241498947144, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.31005042791366577, "step": 2200 }, { "epoch": 0.04404, "grad_norm": 2.46875, "grad_norm_var": 0.028938802083333333, "learning_rate": 0.0001, "loss": 4.6842, "loss/crossentropy": 2.067028760910034, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.30712655186653137, "step": 2202 }, { "epoch": 0.04408, "grad_norm": 2.46875, "grad_norm_var": 0.026395670572916665, "learning_rate": 0.0001, "loss": 5.0557, "loss/crossentropy": 2.4397774934768677, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3508765548467636, "step": 2204 }, { "epoch": 0.04412, "grad_norm": 2.890625, "grad_norm_var": 0.028880818684895834, "learning_rate": 0.0001, "loss": 4.966, "loss/crossentropy": 1.8136217594146729, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2495090439915657, "step": 2206 }, { "epoch": 0.04416, "grad_norm": 2.71875, "grad_norm_var": 0.022945149739583334, "learning_rate": 0.0001, "loss": 5.11, "loss/crossentropy": 2.4620203971862793, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.31419822573661804, "step": 2208 }, { "epoch": 0.0442, "grad_norm": 2.625, "grad_norm_var": 0.020042928059895833, "learning_rate": 0.0001, "loss": 4.9756, "loss/crossentropy": 1.8817986249923706, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2705220878124237, "step": 2210 }, { "epoch": 0.04424, "grad_norm": 3.125, "grad_norm_var": 0.02769775390625, "learning_rate": 0.0001, "loss": 5.0069, "loss/crossentropy": 1.9593598246574402, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.262384794652462, "step": 2212 }, { "epoch": 0.04428, "grad_norm": 2.484375, "grad_norm_var": 0.03303120930989583, "learning_rate": 0.0001, "loss": 4.9133, "loss/crossentropy": 2.1003851294517517, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.28965799510478973, "step": 2214 }, { "epoch": 0.04432, "grad_norm": 3.46875, "grad_norm_var": 0.07888081868489584, "learning_rate": 0.0001, "loss": 5.3243, "loss/crossentropy": 2.23227858543396, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3120953291654587, "step": 2216 }, { "epoch": 0.04436, "grad_norm": 2.625, "grad_norm_var": 0.0726226806640625, "learning_rate": 0.0001, "loss": 5.0901, "loss/crossentropy": 1.8880399465560913, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2740217447280884, "step": 2218 }, { "epoch": 0.0444, "grad_norm": 2.5, "grad_norm_var": 0.07111714680989584, "learning_rate": 0.0001, "loss": 4.9444, "loss/crossentropy": 2.132355511188507, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.27536119520664215, "step": 2220 }, { "epoch": 0.04444, "grad_norm": 2.578125, "grad_norm_var": 0.07419331868489583, "learning_rate": 0.0001, "loss": 4.7325, "loss/crossentropy": 1.831633746623993, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26966987550258636, "step": 2222 }, { "epoch": 0.04448, "grad_norm": 2.703125, "grad_norm_var": 0.08056538899739583, "learning_rate": 0.0001, "loss": 5.004, "loss/crossentropy": 2.066656529903412, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.27303647994995117, "step": 2224 }, { "epoch": 0.04452, "grad_norm": 2.75, "grad_norm_var": 0.0809234619140625, "learning_rate": 0.0001, "loss": 4.9265, "loss/crossentropy": 2.1416667699813843, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2781240791082382, "step": 2226 }, { "epoch": 0.04456, "grad_norm": 2.671875, "grad_norm_var": 0.07366536458333334, "learning_rate": 0.0001, "loss": 5.0701, "loss/crossentropy": 1.7953566908836365, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.27528999745845795, "step": 2228 }, { "epoch": 0.0446, "grad_norm": 2.84375, "grad_norm_var": 0.06728515625, "learning_rate": 0.0001, "loss": 5.0182, "loss/crossentropy": 2.1580333709716797, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2998732179403305, "step": 2230 }, { "epoch": 0.04464, "grad_norm": 2.9375, "grad_norm_var": 0.034326171875, "learning_rate": 0.0001, "loss": 5.3619, "loss/crossentropy": 2.1685701608657837, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3457205891609192, "step": 2232 }, { "epoch": 0.04468, "grad_norm": 2.578125, "grad_norm_var": 0.03345438639322917, "learning_rate": 0.0001, "loss": 4.7602, "loss/crossentropy": 1.9424286484718323, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2728651314973831, "step": 2234 }, { "epoch": 0.04472, "grad_norm": 2.765625, "grad_norm_var": 0.029736328125, "learning_rate": 0.0001, "loss": 5.2351, "loss/crossentropy": 2.2802772521972656, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.30141082406044006, "step": 2236 }, { "epoch": 0.04476, "grad_norm": 2.515625, "grad_norm_var": 0.03277587890625, "learning_rate": 0.0001, "loss": 4.8906, "loss/crossentropy": 1.9490987062454224, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2573640048503876, "step": 2238 }, { "epoch": 0.0448, "grad_norm": 3.171875, "grad_norm_var": 0.037679036458333336, "learning_rate": 0.0001, "loss": 5.2112, "loss/crossentropy": 1.993924081325531, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25628305971622467, "step": 2240 }, { "epoch": 0.04484, "grad_norm": 2.6875, "grad_norm_var": 0.0359283447265625, "learning_rate": 0.0001, "loss": 5.268, "loss/crossentropy": 2.5151875019073486, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.326447993516922, "step": 2242 }, { "epoch": 0.04488, "grad_norm": 2.609375, "grad_norm_var": 0.04641520182291667, "learning_rate": 0.0001, "loss": 5.0191, "loss/crossentropy": 2.5175565481185913, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3036232739686966, "step": 2244 }, { "epoch": 0.04492, "grad_norm": 2.546875, "grad_norm_var": 0.05388997395833333, "learning_rate": 0.0001, "loss": 4.8489, "loss/crossentropy": 2.020721971988678, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.28087201714515686, "step": 2246 }, { "epoch": 0.04496, "grad_norm": 2.96875, "grad_norm_var": 0.045703125, "learning_rate": 0.0001, "loss": 5.6809, "loss/crossentropy": 2.4800511598587036, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3100287467241287, "step": 2248 }, { "epoch": 0.045, "grad_norm": 2.59375, "grad_norm_var": 0.0449127197265625, "learning_rate": 0.0001, "loss": 4.9055, "loss/crossentropy": 1.826172411441803, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2782330811023712, "step": 2250 }, { "epoch": 0.04504, "grad_norm": 2.828125, "grad_norm_var": 0.04533589680989583, "learning_rate": 0.0001, "loss": 5.133, "loss/crossentropy": 2.256316304206848, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3070906698703766, "step": 2252 }, { "epoch": 0.04508, "grad_norm": 2.765625, "grad_norm_var": 0.041402180989583336, "learning_rate": 0.0001, "loss": 5.173, "loss/crossentropy": 1.9046601057052612, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2894355356693268, "step": 2254 }, { "epoch": 0.04512, "grad_norm": 2.828125, "grad_norm_var": 0.03288472493489583, "learning_rate": 0.0001, "loss": 5.0311, "loss/crossentropy": 1.8359373211860657, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2705196440219879, "step": 2256 }, { "epoch": 0.04516, "grad_norm": 2.96875, "grad_norm_var": 0.0349273681640625, "learning_rate": 0.0001, "loss": 4.717, "loss/crossentropy": 2.096512258052826, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26635921001434326, "step": 2258 }, { "epoch": 0.0452, "grad_norm": 2.984375, "grad_norm_var": 0.031371053059895834, "learning_rate": 0.0001, "loss": 5.6736, "loss/crossentropy": 2.4621278047561646, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3513137400150299, "step": 2260 }, { "epoch": 0.04524, "grad_norm": 2.828125, "grad_norm_var": 0.020442708333333334, "learning_rate": 0.0001, "loss": 5.1413, "loss/crossentropy": 1.8345229029655457, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2414976954460144, "step": 2262 }, { "epoch": 0.04528, "grad_norm": 2.65625, "grad_norm_var": 0.026432291666666666, "learning_rate": 0.0001, "loss": 4.9746, "loss/crossentropy": 2.24505877494812, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.31794628500938416, "step": 2264 }, { "epoch": 0.04532, "grad_norm": 2.609375, "grad_norm_var": 0.0285552978515625, "learning_rate": 0.0001, "loss": 5.1691, "loss/crossentropy": 2.2141382694244385, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2925301343202591, "step": 2266 }, { "epoch": 0.04536, "grad_norm": 3.1875, "grad_norm_var": 0.03681233723958333, "learning_rate": 0.0001, "loss": 5.0559, "loss/crossentropy": 2.1515613794326782, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2843547910451889, "step": 2268 }, { "epoch": 0.0454, "grad_norm": 2.765625, "grad_norm_var": 0.03422749837239583, "learning_rate": 0.0001, "loss": 4.6899, "loss/crossentropy": 2.1234883666038513, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2981575280427933, "step": 2270 }, { "epoch": 0.04544, "grad_norm": 2.65625, "grad_norm_var": 0.03806864420572917, "learning_rate": 0.0001, "loss": 4.9871, "loss/crossentropy": 2.1212490797042847, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.276496559381485, "step": 2272 }, { "epoch": 0.04548, "grad_norm": 2.921875, "grad_norm_var": 0.03574930826822917, "learning_rate": 0.0001, "loss": 5.2925, "loss/crossentropy": 2.4330636262893677, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2815839499235153, "step": 2274 }, { "epoch": 0.04552, "grad_norm": 2.65625, "grad_norm_var": 0.028238932291666668, "learning_rate": 0.0001, "loss": 5.2874, "loss/crossentropy": 2.110591411590576, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2832919806241989, "step": 2276 }, { "epoch": 0.04556, "grad_norm": 2.875, "grad_norm_var": 0.20754801432291667, "learning_rate": 0.0001, "loss": 5.0303, "loss/crossentropy": 2.231989800930023, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.28183089196681976, "step": 2278 }, { "epoch": 0.0456, "grad_norm": 2.84375, "grad_norm_var": 0.194873046875, "learning_rate": 0.0001, "loss": 5.3746, "loss/crossentropy": 2.1275558471679688, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.29279497265815735, "step": 2280 }, { "epoch": 0.04564, "grad_norm": 2.8125, "grad_norm_var": 0.1891510009765625, "learning_rate": 0.0001, "loss": 5.2023, "loss/crossentropy": 1.7988306283950806, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2625636160373688, "step": 2282 }, { "epoch": 0.04568, "grad_norm": 2.65625, "grad_norm_var": 0.18694254557291667, "learning_rate": 0.0001, "loss": 5.2017, "loss/crossentropy": 2.3405990600585938, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.30855217576026917, "step": 2284 }, { "epoch": 0.04572, "grad_norm": 2.828125, "grad_norm_var": 0.18612874348958333, "learning_rate": 0.0001, "loss": 5.4582, "loss/crossentropy": 2.2062121629714966, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.30755001306533813, "step": 2286 }, { "epoch": 0.04576, "grad_norm": 2.5625, "grad_norm_var": 0.18968098958333332, "learning_rate": 0.0001, "loss": 4.8984, "loss/crossentropy": 1.9439310431480408, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26844222843647003, "step": 2288 }, { "epoch": 0.0458, "grad_norm": 2.78125, "grad_norm_var": 0.19010416666666666, "learning_rate": 0.0001, "loss": 5.2097, "loss/crossentropy": 2.3106162548065186, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.30821681022644043, "step": 2290 }, { "epoch": 0.04584, "grad_norm": 2.640625, "grad_norm_var": 0.19246317545572916, "learning_rate": 0.0001, "loss": 5.1401, "loss/crossentropy": 2.3809561729431152, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3307010233402252, "step": 2292 }, { "epoch": 0.04588, "grad_norm": 2.703125, "grad_norm_var": 0.01103515625, "learning_rate": 0.0001, "loss": 5.4066, "loss/crossentropy": 2.209702253341675, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2795914113521576, "step": 2294 }, { "epoch": 0.04592, "grad_norm": 2.5625, "grad_norm_var": 0.0158111572265625, "learning_rate": 0.0001, "loss": 4.8772, "loss/crossentropy": 2.4084372520446777, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.31208573281764984, "step": 2296 }, { "epoch": 0.04596, "grad_norm": 2.6875, "grad_norm_var": 0.0146392822265625, "learning_rate": 0.0001, "loss": 4.7757, "loss/crossentropy": 1.9384723901748657, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2625032365322113, "step": 2298 }, { "epoch": 0.046, "grad_norm": 2.78125, "grad_norm_var": 0.013704427083333333, "learning_rate": 0.0001, "loss": 5.2455, "loss/crossentropy": 2.150592088699341, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.278359517455101, "step": 2300 }, { "epoch": 0.04604, "grad_norm": 2.65625, "grad_norm_var": 0.01256103515625, "learning_rate": 0.0001, "loss": 5.0788, "loss/crossentropy": 1.8317970037460327, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2927433103322983, "step": 2302 }, { "epoch": 0.04608, "grad_norm": 3.09375, "grad_norm_var": 0.029564412434895833, "learning_rate": 0.0001, "loss": 5.091, "loss/crossentropy": 2.323367118835449, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2943577915430069, "step": 2304 }, { "epoch": 0.04612, "grad_norm": 2.859375, "grad_norm_var": 0.03943583170572917, "learning_rate": 0.0001, "loss": 5.5301, "loss/crossentropy": 2.369907855987549, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2990037202835083, "step": 2306 }, { "epoch": 0.04616, "grad_norm": 2.921875, "grad_norm_var": 0.04127197265625, "learning_rate": 0.0001, "loss": 4.7508, "loss/crossentropy": 1.691443145275116, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.27280642092227936, "step": 2308 }, { "epoch": 0.0462, "grad_norm": 2.71875, "grad_norm_var": 0.0464508056640625, "learning_rate": 0.0001, "loss": 4.9413, "loss/crossentropy": 2.2883838415145874, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3064710944890976, "step": 2310 }, { "epoch": 0.04624, "grad_norm": 2.671875, "grad_norm_var": 0.035380045572916664, "learning_rate": 0.0001, "loss": 5.4165, "loss/crossentropy": 2.2042444944381714, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3241504430770874, "step": 2312 }, { "epoch": 0.04628, "grad_norm": 2.609375, "grad_norm_var": 0.048680623372395836, "learning_rate": 0.0001, "loss": 4.6657, "loss/crossentropy": 1.977793574333191, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2775610163807869, "step": 2314 }, { "epoch": 0.04632, "grad_norm": 24.875, "grad_norm_var": 30.570881144205728, "learning_rate": 0.0001, "loss": 5.8585, "loss/crossentropy": 2.034530758857727, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.27170561254024506, "step": 2316 }, { "epoch": 0.04636, "grad_norm": 2.875, "grad_norm_var": 30.404881795247395, "learning_rate": 0.0001, "loss": 5.1565, "loss/crossentropy": 2.439123511314392, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2999258190393448, "step": 2318 }, { "epoch": 0.0464, "grad_norm": 2.5, "grad_norm_var": 30.530557250976564, "learning_rate": 0.0001, "loss": 4.8907, "loss/crossentropy": 2.2600624561309814, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2952795475721359, "step": 2320 }, { "epoch": 0.04644, "grad_norm": 2.859375, "grad_norm_var": 30.544131469726562, "learning_rate": 0.0001, "loss": 5.0521, "loss/crossentropy": 2.144679367542267, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.28713342547416687, "step": 2322 }, { "epoch": 0.04648, "grad_norm": 2.890625, "grad_norm_var": 30.469155883789064, "learning_rate": 0.0001, "loss": 5.5054, "loss/crossentropy": 2.34474778175354, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.292633980512619, "step": 2324 }, { "epoch": 0.04652, "grad_norm": 2.609375, "grad_norm_var": 30.491536458333332, "learning_rate": 0.0001, "loss": 4.5903, "loss/crossentropy": 1.96743243932724, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.265610933303833, "step": 2326 }, { "epoch": 0.04656, "grad_norm": 2.53125, "grad_norm_var": 30.524051920572916, "learning_rate": 0.0001, "loss": 5.2353, "loss/crossentropy": 2.3895785808563232, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3287513107061386, "step": 2328 }, { "epoch": 0.0466, "grad_norm": 2.984375, "grad_norm_var": 30.446451822916668, "learning_rate": 0.0001, "loss": 4.9017, "loss/crossentropy": 2.0607098937034607, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.30049796402454376, "step": 2330 }, { "epoch": 0.04664, "grad_norm": 2.96875, "grad_norm_var": 0.08322652180989583, "learning_rate": 0.0001, "loss": 5.1698, "loss/crossentropy": 2.162124752998352, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2983020693063736, "step": 2332 }, { "epoch": 0.04668, "grad_norm": 2.890625, "grad_norm_var": 0.05139567057291667, "learning_rate": 0.0001, "loss": 5.2702, "loss/crossentropy": 2.34970760345459, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3322293907403946, "step": 2334 }, { "epoch": 0.04672, "grad_norm": 2.703125, "grad_norm_var": 0.04527587890625, "learning_rate": 0.0001, "loss": 4.9763, "loss/crossentropy": 1.9286972284317017, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2752893418073654, "step": 2336 }, { "epoch": 0.04676, "grad_norm": 2.828125, "grad_norm_var": 0.04159749348958333, "learning_rate": 0.0001, "loss": 5.1677, "loss/crossentropy": 2.182044267654419, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.30808278918266296, "step": 2338 }, { "epoch": 0.0468, "grad_norm": 2.96875, "grad_norm_var": 0.03916727701822917, "learning_rate": 0.0001, "loss": 5.28, "loss/crossentropy": 2.0002610087394714, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2944178581237793, "step": 2340 }, { "epoch": 0.04684, "grad_norm": 2.609375, "grad_norm_var": 0.034830729166666664, "learning_rate": 0.0001, "loss": 4.984, "loss/crossentropy": 2.0721842646598816, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2769011855125427, "step": 2342 }, { "epoch": 0.04688, "grad_norm": 2.625, "grad_norm_var": 0.03178609212239583, "learning_rate": 0.0001, "loss": 5.0911, "loss/crossentropy": 1.9710460305213928, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.28619086742401123, "step": 2344 }, { "epoch": 0.04692, "grad_norm": 3.28125, "grad_norm_var": 1.54605712890625, "learning_rate": 0.0001, "loss": 5.5092, "loss/crossentropy": 2.0506762266159058, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.31705181300640106, "step": 2346 }, { "epoch": 0.04696, "grad_norm": 2.609375, "grad_norm_var": 1.5601064046223958, "learning_rate": 0.0001, "loss": 5.0045, "loss/crossentropy": 2.0095930695533752, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2831149846315384, "step": 2348 }, { "epoch": 0.047, "grad_norm": 2.890625, "grad_norm_var": 1.573631795247396, "learning_rate": 0.0001, "loss": 5.19, "loss/crossentropy": 2.023163616657257, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2813292294740677, "step": 2350 }, { "epoch": 0.04704, "grad_norm": 2.671875, "grad_norm_var": 1.561424763997396, "learning_rate": 0.0001, "loss": 5.2907, "loss/crossentropy": 2.230435371398926, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.30891451239585876, "step": 2352 }, { "epoch": 0.04708, "grad_norm": 3.0625, "grad_norm_var": 1.5700154622395834, "learning_rate": 0.0001, "loss": 4.9261, "loss/crossentropy": 2.1553521156311035, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.29839709401130676, "step": 2354 }, { "epoch": 0.04712, "grad_norm": 2.96875, "grad_norm_var": 1.5770792643229166, "learning_rate": 0.0001, "loss": 5.2553, "loss/crossentropy": 2.175648272037506, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2972148358821869, "step": 2356 }, { "epoch": 0.04716, "grad_norm": 2.703125, "grad_norm_var": 1.5980377197265625, "learning_rate": 0.0001, "loss": 5.0436, "loss/crossentropy": 2.3852503299713135, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3166612535715103, "step": 2358 }, { "epoch": 0.0472, "grad_norm": 2.75, "grad_norm_var": 1.5826456705729166, "learning_rate": 0.0001, "loss": 5.1827, "loss/crossentropy": 2.1905999183654785, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.29487256705760956, "step": 2360 }, { "epoch": 0.04724, "grad_norm": 2.5625, "grad_norm_var": 0.0413482666015625, "learning_rate": 0.0001, "loss": 4.9295, "loss/crossentropy": 1.9224759340286255, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.27821336686611176, "step": 2362 }, { "epoch": 0.04728, "grad_norm": 3.109375, "grad_norm_var": 0.04057515462239583, "learning_rate": 0.0001, "loss": 5.1078, "loss/crossentropy": 2.5025261640548706, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3345927745103836, "step": 2364 }, { "epoch": 0.04732, "grad_norm": 2.640625, "grad_norm_var": 0.04006245930989583, "learning_rate": 0.0001, "loss": 5.0502, "loss/crossentropy": 2.2385451793670654, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.27095621824264526, "step": 2366 }, { "epoch": 0.04736, "grad_norm": 4.5, "grad_norm_var": 0.22924702962239582, "learning_rate": 0.0001, "loss": 5.2761, "loss/crossentropy": 2.0266553163528442, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2856762409210205, "step": 2368 }, { "epoch": 0.0474, "grad_norm": 3.359375, "grad_norm_var": 0.24001363118489583, "learning_rate": 0.0001, "loss": 5.4918, "loss/crossentropy": 2.5139355659484863, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3186872750520706, "step": 2370 }, { "epoch": 0.04744, "grad_norm": 2.609375, "grad_norm_var": 0.24321187337239583, "learning_rate": 0.0001, "loss": 5.1302, "loss/crossentropy": 2.066476881504059, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2889470160007477, "step": 2372 }, { "epoch": 0.04748, "grad_norm": 2.671875, "grad_norm_var": 0.23772786458333334, "learning_rate": 0.0001, "loss": 5.0031, "loss/crossentropy": 2.0537307262420654, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.31795741617679596, "step": 2374 }, { "epoch": 0.04752, "grad_norm": 2.546875, "grad_norm_var": 0.24492085774739583, "learning_rate": 0.0001, "loss": 4.9922, "loss/crossentropy": 1.9254986643791199, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25411880016326904, "step": 2376 }, { "epoch": 0.04756, "grad_norm": 2.671875, "grad_norm_var": 0.23855692545572918, "learning_rate": 0.0001, "loss": 5.0284, "loss/crossentropy": 2.221043348312378, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.28534361720085144, "step": 2378 }, { "epoch": 0.0476, "grad_norm": 2.90625, "grad_norm_var": 0.228271484375, "learning_rate": 0.0001, "loss": 5.2106, "loss/crossentropy": 2.3516281843185425, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3128499984741211, "step": 2380 }, { "epoch": 0.04764, "grad_norm": 2.734375, "grad_norm_var": 0.22349853515625, "learning_rate": 0.0001, "loss": 5.6266, "loss/crossentropy": 2.2144338488578796, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3071902245283127, "step": 2382 }, { "epoch": 0.04768, "grad_norm": 2.625, "grad_norm_var": 0.0567291259765625, "learning_rate": 0.0001, "loss": 5.2429, "loss/crossentropy": 2.3324203491210938, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.322970449924469, "step": 2384 }, { "epoch": 0.04772, "grad_norm": 2.546875, "grad_norm_var": 0.03332926432291667, "learning_rate": 0.0001, "loss": 4.7732, "loss/crossentropy": 2.08349871635437, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2774003893136978, "step": 2386 }, { "epoch": 0.04776, "grad_norm": 2.4375, "grad_norm_var": 0.03902587890625, "learning_rate": 0.0001, "loss": 4.8585, "loss/crossentropy": 1.9565780758857727, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.29790589213371277, "step": 2388 }, { "epoch": 0.0478, "grad_norm": 2.625, "grad_norm_var": 0.022782389322916666, "learning_rate": 0.0001, "loss": 5.0176, "loss/crossentropy": 2.261398434638977, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3143853694200516, "step": 2390 }, { "epoch": 0.04784, "grad_norm": 2.96875, "grad_norm_var": 0.024494425455729166, "learning_rate": 0.0001, "loss": 5.0688, "loss/crossentropy": 1.9077460169792175, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25433091819286346, "step": 2392 }, { "epoch": 0.04788, "grad_norm": 2.65625, "grad_norm_var": 0.0240142822265625, "learning_rate": 0.0001, "loss": 4.9531, "loss/crossentropy": 1.9948468208312988, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2598777562379837, "step": 2394 }, { "epoch": 0.04792, "grad_norm": 2.9375, "grad_norm_var": 0.45455729166666664, "learning_rate": 0.0001, "loss": 5.0972, "loss/crossentropy": 2.1177526116371155, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2775426208972931, "step": 2396 }, { "epoch": 0.04796, "grad_norm": 2.5, "grad_norm_var": 0.4634348551432292, "learning_rate": 0.0001, "loss": 4.8571, "loss/crossentropy": 2.1756062507629395, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.33678852021694183, "step": 2398 }, { "epoch": 0.048, "grad_norm": 2.984375, "grad_norm_var": 0.4576171875, "learning_rate": 0.0001, "loss": 5.2617, "loss/crossentropy": 2.0923725366592407, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3255026638507843, "step": 2400 }, { "epoch": 0.04804, "grad_norm": 2.84375, "grad_norm_var": 0.4471181233723958, "learning_rate": 0.0001, "loss": 4.9421, "loss/crossentropy": 1.9236682653427124, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2642124071717262, "step": 2402 }, { "epoch": 0.04808, "grad_norm": 2.84375, "grad_norm_var": 0.43757222493489584, "learning_rate": 0.0001, "loss": 5.0604, "loss/crossentropy": 2.1742242574691772, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.30006398260593414, "step": 2404 }, { "epoch": 0.04812, "grad_norm": 2.6875, "grad_norm_var": 0.43835347493489585, "learning_rate": 0.0001, "loss": 4.7077, "loss/crossentropy": 1.7445701956748962, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25791122019290924, "step": 2406 }, { "epoch": 0.04816, "grad_norm": 4.34375, "grad_norm_var": 0.5614735921223958, "learning_rate": 0.0001, "loss": 5.1289, "loss/crossentropy": 1.8616467714309692, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2532489001750946, "step": 2408 }, { "epoch": 0.0482, "grad_norm": 3.203125, "grad_norm_var": 0.5608723958333334, "learning_rate": 0.0001, "loss": 5.0486, "loss/crossentropy": 1.9146783351898193, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2659924626350403, "step": 2410 }, { "epoch": 0.04824, "grad_norm": 2.859375, "grad_norm_var": 0.21818745930989583, "learning_rate": 0.0001, "loss": 5.0972, "loss/crossentropy": 2.179564118385315, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.29064056277275085, "step": 2412 }, { "epoch": 0.04828, "grad_norm": 2.65625, "grad_norm_var": 0.21357421875, "learning_rate": 0.0001, "loss": 4.9578, "loss/crossentropy": 2.04409658908844, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.27891072630882263, "step": 2414 }, { "epoch": 0.04832, "grad_norm": 2.625, "grad_norm_var": 0.22568257649739584, "learning_rate": 0.0001, "loss": 4.8833, "loss/crossentropy": 2.590337038040161, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3158426731824875, "step": 2416 }, { "epoch": 0.04836, "grad_norm": 2.625, "grad_norm_var": 0.23155924479166667, "learning_rate": 0.0001, "loss": 4.6919, "loss/crossentropy": 1.8753941059112549, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2669401317834854, "step": 2418 }, { "epoch": 0.0484, "grad_norm": 2.734375, "grad_norm_var": 0.23361714680989584, "learning_rate": 0.0001, "loss": 5.0231, "loss/crossentropy": 2.1412659287452698, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2839512526988983, "step": 2420 }, { "epoch": 0.04844, "grad_norm": 2.65625, "grad_norm_var": 0.23371988932291668, "learning_rate": 0.0001, "loss": 5.1187, "loss/crossentropy": 2.545991063117981, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3331379294395447, "step": 2422 }, { "epoch": 0.04848, "grad_norm": 3.25, "grad_norm_var": 0.089599609375, "learning_rate": 0.0001, "loss": 5.1246, "loss/crossentropy": 2.12838077545166, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3043065369129181, "step": 2424 }, { "epoch": 0.04852, "grad_norm": 2.953125, "grad_norm_var": 0.03660380045572917, "learning_rate": 0.0001, "loss": 5.3821, "loss/crossentropy": 2.1983221769332886, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2846619784832001, "step": 2426 }, { "epoch": 0.04856, "grad_norm": 3.0625, "grad_norm_var": 0.03819986979166667, "learning_rate": 0.0001, "loss": 5.1044, "loss/crossentropy": 2.241136312484741, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.30275705456733704, "step": 2428 }, { "epoch": 0.0486, "grad_norm": 2.890625, "grad_norm_var": 0.03831278483072917, "learning_rate": 0.0001, "loss": 5.203, "loss/crossentropy": 2.097459554672241, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2986721396446228, "step": 2430 }, { "epoch": 0.04864, "grad_norm": 4.4375, "grad_norm_var": 0.20159403483072916, "learning_rate": 0.0001, "loss": 5.1158, "loss/crossentropy": 2.333081007003784, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2975248098373413, "step": 2432 }, { "epoch": 0.04868, "grad_norm": 2.625, "grad_norm_var": 0.19758707682291668, "learning_rate": 0.0001, "loss": 4.9183, "loss/crossentropy": 2.3171510696411133, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.28452740609645844, "step": 2434 }, { "epoch": 0.04872, "grad_norm": 2.375, "grad_norm_var": 0.2141754150390625, "learning_rate": 0.0001, "loss": 4.8853, "loss/crossentropy": 1.8334497213363647, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25271379947662354, "step": 2436 }, { "epoch": 0.04876, "grad_norm": 2.78125, "grad_norm_var": 0.20258687337239584, "learning_rate": 0.0001, "loss": 5.3809, "loss/crossentropy": 2.2712661027908325, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.30827929079532623, "step": 2438 }, { "epoch": 0.0488, "grad_norm": 3.3125, "grad_norm_var": 0.20465087890625, "learning_rate": 0.0001, "loss": 5.0863, "loss/crossentropy": 2.160263180732727, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3078690320253372, "step": 2440 }, { "epoch": 0.04884, "grad_norm": 2.96875, "grad_norm_var": 0.20429585774739584, "learning_rate": 0.0001, "loss": 5.2224, "loss/crossentropy": 2.071319878101349, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2733730524778366, "step": 2442 }, { "epoch": 0.04888, "grad_norm": 2.796875, "grad_norm_var": 0.203076171875, "learning_rate": 0.0001, "loss": 5.1476, "loss/crossentropy": 2.0742560029029846, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2835587412118912, "step": 2444 }, { "epoch": 0.04892, "grad_norm": 2.765625, "grad_norm_var": 0.24807535807291667, "learning_rate": 0.0001, "loss": 5.0211, "loss/crossentropy": 1.8836837410926819, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3089100867509842, "step": 2446 }, { "epoch": 0.04896, "grad_norm": 2.875, "grad_norm_var": 0.177099609375, "learning_rate": 0.0001, "loss": 4.9537, "loss/crossentropy": 1.8539315462112427, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2697101980447769, "step": 2448 }, { "epoch": 0.049, "grad_norm": 2.828125, "grad_norm_var": 0.17392578125, "learning_rate": 0.0001, "loss": 5.1907, "loss/crossentropy": 2.219490647315979, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2898380011320114, "step": 2450 }, { "epoch": 0.04904, "grad_norm": 3.1875, "grad_norm_var": 0.13853251139322917, "learning_rate": 0.0001, "loss": 5.468, "loss/crossentropy": 2.328765392303467, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.4013051837682724, "step": 2452 }, { "epoch": 0.04908, "grad_norm": 2.5625, "grad_norm_var": 0.14879557291666667, "learning_rate": 0.0001, "loss": 4.8967, "loss/crossentropy": 1.9204192161560059, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25281261652708054, "step": 2454 }, { "epoch": 0.04912, "grad_norm": 2.765625, "grad_norm_var": 0.15563151041666667, "learning_rate": 0.0001, "loss": 5.0935, "loss/crossentropy": 2.377043604850769, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2928486764431, "step": 2456 }, { "epoch": 0.04916, "grad_norm": 2.796875, "grad_norm_var": 0.15930989583333333, "learning_rate": 0.0001, "loss": 5.4528, "loss/crossentropy": 2.4364209175109863, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3111976683139801, "step": 2458 }, { "epoch": 0.0492, "grad_norm": 2.703125, "grad_norm_var": 0.15985921223958333, "learning_rate": 0.0001, "loss": 5.1357, "loss/crossentropy": 2.3738330602645874, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3019126206636429, "step": 2460 }, { "epoch": 0.04924, "grad_norm": 2.765625, "grad_norm_var": 0.1141265869140625, "learning_rate": 0.0001, "loss": 5.2876, "loss/crossentropy": 2.4575772285461426, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3428986072540283, "step": 2462 }, { "epoch": 0.04928, "grad_norm": 2.734375, "grad_norm_var": 0.028888956705729166, "learning_rate": 0.0001, "loss": 4.6505, "loss/crossentropy": 1.9849627017974854, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.27611130475997925, "step": 2464 }, { "epoch": 0.04932, "grad_norm": 2.625, "grad_norm_var": 0.030402628580729167, "learning_rate": 0.0001, "loss": 4.8482, "loss/crossentropy": 1.9617170691490173, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.257377490401268, "step": 2466 }, { "epoch": 0.04936, "grad_norm": 2.640625, "grad_norm_var": 0.01871337890625, "learning_rate": 0.0001, "loss": 5.1739, "loss/crossentropy": 2.4031273126602173, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.28843145072460175, "step": 2468 }, { "epoch": 0.0494, "grad_norm": 2.484375, "grad_norm_var": 0.0165435791015625, "learning_rate": 0.0001, "loss": 4.8452, "loss/crossentropy": 1.7263792753219604, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25591571629047394, "step": 2470 }, { "epoch": 0.04944, "grad_norm": 2.734375, "grad_norm_var": 0.0191802978515625, "learning_rate": 0.0001, "loss": 4.7154, "loss/crossentropy": 2.106898784637451, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.280623197555542, "step": 2472 }, { "epoch": 0.04948, "grad_norm": 2.6875, "grad_norm_var": 0.018358357747395835, "learning_rate": 0.0001, "loss": 5.0092, "loss/crossentropy": 2.325208902359009, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.27794161438941956, "step": 2474 }, { "epoch": 0.04952, "grad_norm": 2.65625, "grad_norm_var": 0.013736979166666666, "learning_rate": 0.0001, "loss": 5.1128, "loss/crossentropy": 2.367414712905884, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.31346653401851654, "step": 2476 }, { "epoch": 0.04956, "grad_norm": 2.59375, "grad_norm_var": 0.013792928059895833, "learning_rate": 0.0001, "loss": 5.2611, "loss/crossentropy": 2.191115140914917, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.30560287833213806, "step": 2478 }, { "epoch": 0.0496, "grad_norm": 2.765625, "grad_norm_var": 0.012430826822916666, "learning_rate": 0.0001, "loss": 4.9993, "loss/crossentropy": 2.2559624314308167, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3029457628726959, "step": 2480 }, { "epoch": 0.04964, "grad_norm": 3.390625, "grad_norm_var": 0.04160868326822917, "learning_rate": 0.0001, "loss": 5.125, "loss/crossentropy": 2.0088382363319397, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2923784404993057, "step": 2482 }, { "epoch": 0.04968, "grad_norm": 2.71875, "grad_norm_var": 0.03943583170572917, "learning_rate": 0.0001, "loss": 4.8858, "loss/crossentropy": 1.8445329070091248, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2587483897805214, "step": 2484 }, { "epoch": 0.04972, "grad_norm": 2.8125, "grad_norm_var": 0.03534749348958333, "learning_rate": 0.0001, "loss": 4.7918, "loss/crossentropy": 2.015140950679779, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.27480585873126984, "step": 2486 }, { "epoch": 0.04976, "grad_norm": 2.765625, "grad_norm_var": 0.030833943684895834, "learning_rate": 0.0001, "loss": 5.1959, "loss/crossentropy": 1.918801188468933, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.28786011040210724, "step": 2488 }, { "epoch": 0.0498, "grad_norm": 2.734375, "grad_norm_var": 0.030269368489583334, "learning_rate": 0.0001, "loss": 5.0108, "loss/crossentropy": 1.9899121522903442, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25822708010673523, "step": 2490 }, { "epoch": 0.04984, "grad_norm": 2.59375, "grad_norm_var": 0.037206013997395836, "learning_rate": 0.0001, "loss": 4.7259, "loss/crossentropy": 2.3775535821914673, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.31345370411872864, "step": 2492 }, { "epoch": 0.04988, "grad_norm": 2.71875, "grad_norm_var": 0.03791402180989583, "learning_rate": 0.0001, "loss": 4.9496, "loss/crossentropy": 2.0874632596969604, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2836592495441437, "step": 2494 }, { "epoch": 0.04992, "grad_norm": 2.78125, "grad_norm_var": 0.046019490559895834, "learning_rate": 0.0001, "loss": 5.2404, "loss/crossentropy": 2.226976454257965, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.27826404571533203, "step": 2496 }, { "epoch": 0.04996, "grad_norm": 2.71875, "grad_norm_var": 0.023900349934895832, "learning_rate": 0.0001, "loss": 5.1503, "loss/crossentropy": 2.4569294452667236, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.31981319189071655, "step": 2498 }, { "epoch": 0.05, "grad_norm": 2.703125, "grad_norm_var": 0.025581868489583333, "learning_rate": 0.0001, "loss": 4.922, "loss/crossentropy": 2.1134212017059326, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.28260529041290283, "step": 2500 }, { "epoch": 0.05004, "grad_norm": 3.0, "grad_norm_var": 0.029195149739583332, "learning_rate": 0.0001, "loss": 5.3555, "loss/crossentropy": 2.2914888858795166, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3182682394981384, "step": 2502 }, { "epoch": 0.05008, "grad_norm": 2.65625, "grad_norm_var": 0.030078125, "learning_rate": 0.0001, "loss": 4.9644, "loss/crossentropy": 2.3261003494262695, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.30558250844478607, "step": 2504 }, { "epoch": 0.05012, "grad_norm": 2.703125, "grad_norm_var": 0.030598958333333332, "learning_rate": 0.0001, "loss": 4.9517, "loss/crossentropy": 2.351989507675171, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2934701144695282, "step": 2506 }, { "epoch": 0.05016, "grad_norm": 2.5625, "grad_norm_var": 0.025031534830729167, "learning_rate": 0.0001, "loss": 4.71, "loss/crossentropy": 1.8742690086364746, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26305729895830154, "step": 2508 }, { "epoch": 0.0502, "grad_norm": 2.78125, "grad_norm_var": 0.0247222900390625, "learning_rate": 0.0001, "loss": 4.8214, "loss/crossentropy": 2.1668856143951416, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2728075534105301, "step": 2510 }, { "epoch": 0.05024, "grad_norm": 2.734375, "grad_norm_var": 0.0137115478515625, "learning_rate": 0.0001, "loss": 4.7171, "loss/crossentropy": 1.773424208164215, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26808495819568634, "step": 2512 }, { "epoch": 0.05028, "grad_norm": 2.59375, "grad_norm_var": 0.013109334309895833, "learning_rate": 0.0001, "loss": 4.9224, "loss/crossentropy": 1.7541643977165222, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2600491940975189, "step": 2514 }, { "epoch": 0.05032, "grad_norm": 2.796875, "grad_norm_var": 0.011324055989583333, "learning_rate": 0.0001, "loss": 5.129, "loss/crossentropy": 1.9693496227264404, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2844541072845459, "step": 2516 }, { "epoch": 0.05036, "grad_norm": 2.765625, "grad_norm_var": 0.009065755208333333, "learning_rate": 0.0001, "loss": 4.9202, "loss/crossentropy": 1.7539461851119995, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2542608380317688, "step": 2518 }, { "epoch": 0.0504, "grad_norm": 2.984375, "grad_norm_var": 0.017878214518229168, "learning_rate": 0.0001, "loss": 5.0686, "loss/crossentropy": 2.155138313770294, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3023010194301605, "step": 2520 }, { "epoch": 0.05044, "grad_norm": 2.609375, "grad_norm_var": 0.018619791666666666, "learning_rate": 0.0001, "loss": 4.9674, "loss/crossentropy": 2.069350838661194, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26573850214481354, "step": 2522 }, { "epoch": 0.05048, "grad_norm": 2.5, "grad_norm_var": 0.022101847330729167, "learning_rate": 0.0001, "loss": 5.0295, "loss/crossentropy": 2.1864534616470337, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.27845603227615356, "step": 2524 }, { "epoch": 0.05052, "grad_norm": 2.5, "grad_norm_var": 0.023509724934895834, "learning_rate": 0.0001, "loss": 5.1173, "loss/crossentropy": 2.2462135553359985, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.29221346974372864, "step": 2526 }, { "epoch": 0.05056, "grad_norm": 2.78125, "grad_norm_var": 0.030492146809895832, "learning_rate": 0.0001, "loss": 5.0027, "loss/crossentropy": 2.043266773223877, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25857964158058167, "step": 2528 }, { "epoch": 0.0506, "grad_norm": 2.65625, "grad_norm_var": 0.0299957275390625, "learning_rate": 0.0001, "loss": 4.9672, "loss/crossentropy": 1.8892702460289001, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2879898101091385, "step": 2530 }, { "epoch": 0.05064, "grad_norm": 2.46875, "grad_norm_var": 0.03238525390625, "learning_rate": 0.0001, "loss": 4.5332, "loss/crossentropy": 1.9220558404922485, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26652154326438904, "step": 2532 }, { "epoch": 0.05068, "grad_norm": 2.515625, "grad_norm_var": 0.03430989583333333, "learning_rate": 0.0001, "loss": 4.4176, "loss/crossentropy": 1.7282914519309998, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2367371916770935, "step": 2534 }, { "epoch": 0.05072, "grad_norm": 2.78125, "grad_norm_var": 0.021092732747395832, "learning_rate": 0.0001, "loss": 4.9589, "loss/crossentropy": 1.729803204536438, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25624871999025345, "step": 2536 }, { "epoch": 0.05076, "grad_norm": 2.578125, "grad_norm_var": 0.020873006184895834, "learning_rate": 0.0001, "loss": 4.6952, "loss/crossentropy": 2.0921449661254883, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2855897545814514, "step": 2538 }, { "epoch": 0.0508, "grad_norm": 2.6875, "grad_norm_var": 0.015404256184895833, "learning_rate": 0.0001, "loss": 4.8967, "loss/crossentropy": 2.0569751858711243, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2829667925834656, "step": 2540 }, { "epoch": 0.05084, "grad_norm": 2.609375, "grad_norm_var": 0.015973917643229165, "learning_rate": 0.0001, "loss": 5.2438, "loss/crossentropy": 1.983904242515564, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.30939212441444397, "step": 2542 }, { "epoch": 0.05088, "grad_norm": 2.703125, "grad_norm_var": 0.011864217122395833, "learning_rate": 0.0001, "loss": 4.9968, "loss/crossentropy": 2.2631462812423706, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.27867285907268524, "step": 2544 }, { "epoch": 0.05092, "grad_norm": 3.078125, "grad_norm_var": 0.023856608072916667, "learning_rate": 0.0001, "loss": 4.9621, "loss/crossentropy": 1.9918989539146423, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.27703428268432617, "step": 2546 }, { "epoch": 0.05096, "grad_norm": 2.78125, "grad_norm_var": 0.021110026041666667, "learning_rate": 0.0001, "loss": 5.2041, "loss/crossentropy": 2.1356931924819946, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.27815964818000793, "step": 2548 }, { "epoch": 0.051, "grad_norm": 2.46875, "grad_norm_var": 0.019071451822916665, "learning_rate": 0.0001, "loss": 4.6514, "loss/crossentropy": 2.172751545906067, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2971910834312439, "step": 2550 }, { "epoch": 0.05104, "grad_norm": 2.578125, "grad_norm_var": 0.019527180989583334, "learning_rate": 0.0001, "loss": 5.0178, "loss/crossentropy": 2.0799094438552856, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.30459292232990265, "step": 2552 }, { "epoch": 0.05108, "grad_norm": 2.5, "grad_norm_var": 0.0211090087890625, "learning_rate": 0.0001, "loss": 4.8235, "loss/crossentropy": 1.7769129872322083, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2526697665452957, "step": 2554 }, { "epoch": 0.05112, "grad_norm": 2.8125, "grad_norm_var": 0.024494425455729166, "learning_rate": 0.0001, "loss": 4.8457, "loss/crossentropy": 2.044790804386139, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2821648418903351, "step": 2556 }, { "epoch": 0.05116, "grad_norm": 2.921875, "grad_norm_var": 0.0287109375, "learning_rate": 0.0001, "loss": 5.5336, "loss/crossentropy": 2.3708614110946655, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.30810464918613434, "step": 2558 }, { "epoch": 0.0512, "grad_norm": 2.71875, "grad_norm_var": 0.028709920247395833, "learning_rate": 0.0001, "loss": 5.2385, "loss/crossentropy": 2.2216718196868896, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2919304668903351, "step": 2560 }, { "epoch": 0.05124, "grad_norm": 2.671875, "grad_norm_var": 0.019466145833333334, "learning_rate": 0.0001, "loss": 5.28, "loss/crossentropy": 2.4692097902297974, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.30620162189006805, "step": 2562 }, { "epoch": 0.05128, "grad_norm": 2.984375, "grad_norm_var": 0.026936848958333332, "learning_rate": 0.0001, "loss": 4.9476, "loss/crossentropy": 2.0491623282432556, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25617313385009766, "step": 2564 }, { "epoch": 0.05132, "grad_norm": 2.8125, "grad_norm_var": 0.023368326822916667, "learning_rate": 0.0001, "loss": 4.958, "loss/crossentropy": 1.8305597305297852, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25237561762332916, "step": 2566 }, { "epoch": 0.05136, "grad_norm": 2.640625, "grad_norm_var": 0.0223297119140625, "learning_rate": 0.0001, "loss": 4.9853, "loss/crossentropy": 1.9471853971481323, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2712964415550232, "step": 2568 }, { "epoch": 0.0514, "grad_norm": 2.765625, "grad_norm_var": 0.0198638916015625, "learning_rate": 0.0001, "loss": 5.0932, "loss/crossentropy": 2.575412631034851, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3106851130723953, "step": 2570 }, { "epoch": 0.05144, "grad_norm": 2.515625, "grad_norm_var": 0.020686848958333334, "learning_rate": 0.0001, "loss": 4.6755, "loss/crossentropy": 2.0210241079330444, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.269680991768837, "step": 2572 }, { "epoch": 0.05148, "grad_norm": 2.4375, "grad_norm_var": 0.02076416015625, "learning_rate": 0.0001, "loss": 4.6308, "loss/crossentropy": 1.9054389595985413, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24933087825775146, "step": 2574 }, { "epoch": 0.05152, "grad_norm": 2.625, "grad_norm_var": 0.021284993489583334, "learning_rate": 0.0001, "loss": 4.9682, "loss/crossentropy": 2.142069697380066, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3162301778793335, "step": 2576 }, { "epoch": 0.05156, "grad_norm": 2.59375, "grad_norm_var": 0.0197662353515625, "learning_rate": 0.0001, "loss": 5.018, "loss/crossentropy": 1.9952309727668762, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2560836151242256, "step": 2578 }, { "epoch": 0.0516, "grad_norm": 2.765625, "grad_norm_var": 0.016405232747395835, "learning_rate": 0.0001, "loss": 4.8889, "loss/crossentropy": 2.0579317212104797, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26214616745710373, "step": 2580 }, { "epoch": 0.05164, "grad_norm": 2.46875, "grad_norm_var": 0.016927083333333332, "learning_rate": 0.0001, "loss": 4.6306, "loss/crossentropy": 2.076499104499817, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26244185864925385, "step": 2582 }, { "epoch": 0.05168, "grad_norm": 2.890625, "grad_norm_var": 0.026146443684895833, "learning_rate": 0.0001, "loss": 4.9393, "loss/crossentropy": 2.2277501821517944, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.32310059666633606, "step": 2584 }, { "epoch": 0.05172, "grad_norm": 2.53125, "grad_norm_var": 0.03141988118489583, "learning_rate": 0.0001, "loss": 5.0929, "loss/crossentropy": 2.101436138153076, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26044395565986633, "step": 2586 }, { "epoch": 0.05176, "grad_norm": 2.703125, "grad_norm_var": 0.030939737955729168, "learning_rate": 0.0001, "loss": 5.045, "loss/crossentropy": 2.2617305517196655, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2768043726682663, "step": 2588 }, { "epoch": 0.0518, "grad_norm": 2.515625, "grad_norm_var": 0.0349029541015625, "learning_rate": 0.0001, "loss": 4.989, "loss/crossentropy": 2.2669495940208435, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2941794842481613, "step": 2590 }, { "epoch": 0.05184, "grad_norm": 2.546875, "grad_norm_var": 0.03871968587239583, "learning_rate": 0.0001, "loss": 4.7676, "loss/crossentropy": 1.8102391958236694, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24766983091831207, "step": 2592 }, { "epoch": 0.05188, "grad_norm": 2.53125, "grad_norm_var": 0.04006754557291667, "learning_rate": 0.0001, "loss": 5.0022, "loss/crossentropy": 2.1426219940185547, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2808763086795807, "step": 2594 }, { "epoch": 0.05192, "grad_norm": 2.75, "grad_norm_var": 0.0368804931640625, "learning_rate": 0.0001, "loss": 4.8649, "loss/crossentropy": 2.0731321573257446, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.28768520057201385, "step": 2596 }, { "epoch": 0.05196, "grad_norm": 2.46875, "grad_norm_var": 0.03619384765625, "learning_rate": 0.0001, "loss": 4.8019, "loss/crossentropy": 1.8902159333229065, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.27209727466106415, "step": 2598 }, { "epoch": 0.052, "grad_norm": 2.390625, "grad_norm_var": 0.030855305989583335, "learning_rate": 0.0001, "loss": 4.837, "loss/crossentropy": 2.1860097646713257, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3191404938697815, "step": 2600 }, { "epoch": 0.05204, "grad_norm": 2.78125, "grad_norm_var": 0.027074178059895832, "learning_rate": 0.0001, "loss": 4.8471, "loss/crossentropy": 2.1010658740997314, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2924908995628357, "step": 2602 }, { "epoch": 0.05208, "grad_norm": 2.71875, "grad_norm_var": 0.0285064697265625, "learning_rate": 0.0001, "loss": 5.1725, "loss/crossentropy": 2.0668399930000305, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2691944092512131, "step": 2604 }, { "epoch": 0.05212, "grad_norm": 2.8125, "grad_norm_var": 0.023824055989583332, "learning_rate": 0.0001, "loss": 5.0115, "loss/crossentropy": 2.310541272163391, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.31448885798454285, "step": 2606 }, { "epoch": 0.05216, "grad_norm": 2.75, "grad_norm_var": 0.0222808837890625, "learning_rate": 0.0001, "loss": 4.7731, "loss/crossentropy": 2.023577332496643, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.28528447449207306, "step": 2608 }, { "epoch": 0.0522, "grad_norm": 2.5625, "grad_norm_var": 0.0217193603515625, "learning_rate": 0.0001, "loss": 4.8408, "loss/crossentropy": 2.0232901573181152, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25540125370025635, "step": 2610 }, { "epoch": 0.05224, "grad_norm": 2.5625, "grad_norm_var": 0.019554646809895833, "learning_rate": 0.0001, "loss": 4.8374, "loss/crossentropy": 2.147561550140381, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2914447784423828, "step": 2612 }, { "epoch": 0.05228, "grad_norm": 2.46875, "grad_norm_var": 0.0193359375, "learning_rate": 0.0001, "loss": 4.8044, "loss/crossentropy": 1.9136184453964233, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.27879445254802704, "step": 2614 }, { "epoch": 0.05232, "grad_norm": 3.03125, "grad_norm_var": 0.0606842041015625, "learning_rate": 0.0001, "loss": 5.1245, "loss/crossentropy": 2.1118494272232056, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2990800142288208, "step": 2616 }, { "epoch": 0.05236, "grad_norm": 2.640625, "grad_norm_var": 0.0599761962890625, "learning_rate": 0.0001, "loss": 5.2189, "loss/crossentropy": 1.9564325213432312, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2805679142475128, "step": 2618 }, { "epoch": 0.0524, "grad_norm": 3.046875, "grad_norm_var": 0.06641337076822916, "learning_rate": 0.0001, "loss": 4.9511, "loss/crossentropy": 2.0683051347732544, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2906472980976105, "step": 2620 }, { "epoch": 0.05244, "grad_norm": 2.609375, "grad_norm_var": 0.06653645833333334, "learning_rate": 0.0001, "loss": 5.058, "loss/crossentropy": 2.0510823130607605, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.28148986399173737, "step": 2622 }, { "epoch": 0.05248, "grad_norm": 2.640625, "grad_norm_var": 0.06256103515625, "learning_rate": 0.0001, "loss": 5.2113, "loss/crossentropy": 2.2972904443740845, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.32902073860168457, "step": 2624 }, { "epoch": 0.05252, "grad_norm": 2.890625, "grad_norm_var": 0.0582427978515625, "learning_rate": 0.0001, "loss": 5.0939, "loss/crossentropy": 1.9502179026603699, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2844041585922241, "step": 2626 }, { "epoch": 0.05256, "grad_norm": 2.640625, "grad_norm_var": 0.05607808430989583, "learning_rate": 0.0001, "loss": 5.1078, "loss/crossentropy": 2.1577298045158386, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2673380598425865, "step": 2628 }, { "epoch": 0.0526, "grad_norm": 2.59375, "grad_norm_var": 0.05271708170572917, "learning_rate": 0.0001, "loss": 5.0514, "loss/crossentropy": 2.1707664132118225, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2697260081768036, "step": 2630 }, { "epoch": 0.05264, "grad_norm": 2.71875, "grad_norm_var": 0.017606608072916665, "learning_rate": 0.0001, "loss": 4.9617, "loss/crossentropy": 2.0975311398506165, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.27033862471580505, "step": 2632 }, { "epoch": 0.05268, "grad_norm": 3.25, "grad_norm_var": 0.033869425455729164, "learning_rate": 0.0001, "loss": 5.1841, "loss/crossentropy": 2.197197914123535, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.28717951476573944, "step": 2634 }, { "epoch": 0.05272, "grad_norm": 2.765625, "grad_norm_var": 0.03570556640625, "learning_rate": 0.0001, "loss": 5.2558, "loss/crossentropy": 2.2898266315460205, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.31359314918518066, "step": 2636 }, { "epoch": 0.05276, "grad_norm": 2.796875, "grad_norm_var": 0.0347564697265625, "learning_rate": 0.0001, "loss": 5.4849, "loss/crossentropy": 2.525710701942444, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3199215829372406, "step": 2638 }, { "epoch": 0.0528, "grad_norm": 2.796875, "grad_norm_var": 0.03349609375, "learning_rate": 0.0001, "loss": 5.0495, "loss/crossentropy": 2.2799761295318604, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.28715676069259644, "step": 2640 }, { "epoch": 0.05284, "grad_norm": 2.796875, "grad_norm_var": 0.032373046875, "learning_rate": 0.0001, "loss": 4.9837, "loss/crossentropy": 1.8681190013885498, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25435833632946014, "step": 2642 }, { "epoch": 0.05288, "grad_norm": 2.578125, "grad_norm_var": 0.03730061848958333, "learning_rate": 0.0001, "loss": 4.807, "loss/crossentropy": 1.9067540168762207, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25444281101226807, "step": 2644 }, { "epoch": 0.05292, "grad_norm": 2.53125, "grad_norm_var": 0.03732096354166667, "learning_rate": 0.0001, "loss": 5.0326, "loss/crossentropy": 2.370971202850342, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.31173495948314667, "step": 2646 }, { "epoch": 0.05296, "grad_norm": 2.671875, "grad_norm_var": 0.03752848307291667, "learning_rate": 0.0001, "loss": 5.1148, "loss/crossentropy": 2.1829749941825867, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.29970400035381317, "step": 2648 }, { "epoch": 0.053, "grad_norm": 2.609375, "grad_norm_var": 0.020052083333333335, "learning_rate": 0.0001, "loss": 5.1086, "loss/crossentropy": 2.0818498134613037, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.27712464332580566, "step": 2650 }, { "epoch": 0.05304, "grad_norm": 2.609375, "grad_norm_var": 0.010789998372395833, "learning_rate": 0.0001, "loss": 5.1071, "loss/crossentropy": 2.2080377340316772, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.27552157640457153, "step": 2652 }, { "epoch": 0.05308, "grad_norm": 2.53125, "grad_norm_var": 0.01109619140625, "learning_rate": 0.0001, "loss": 4.9685, "loss/crossentropy": 1.7115904092788696, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23266373574733734, "step": 2654 }, { "epoch": 0.05312, "grad_norm": 2.59375, "grad_norm_var": 0.009598795572916667, "learning_rate": 0.0001, "loss": 4.8654, "loss/crossentropy": 2.1736810207366943, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.29696571826934814, "step": 2656 }, { "epoch": 0.05316, "grad_norm": 2.765625, "grad_norm_var": 0.012886555989583333, "learning_rate": 0.0001, "loss": 5.0834, "loss/crossentropy": 2.2366485595703125, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3062315583229065, "step": 2658 }, { "epoch": 0.0532, "grad_norm": 2.609375, "grad_norm_var": 0.018748982747395834, "learning_rate": 0.0001, "loss": 5.0888, "loss/crossentropy": 1.9835070371627808, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.35663366317749023, "step": 2660 }, { "epoch": 0.05324, "grad_norm": 2.4375, "grad_norm_var": 0.026688639322916666, "learning_rate": 0.0001, "loss": 4.6725, "loss/crossentropy": 2.148723840713501, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2687191218137741, "step": 2662 }, { "epoch": 0.05328, "grad_norm": 2.625, "grad_norm_var": 0.02789306640625, "learning_rate": 0.0001, "loss": 4.9191, "loss/crossentropy": 2.2642128467559814, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2738381028175354, "step": 2664 }, { "epoch": 0.05332, "grad_norm": 2.546875, "grad_norm_var": 0.028483072916666668, "learning_rate": 0.0001, "loss": 4.8209, "loss/crossentropy": 1.839052438735962, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2566695362329483, "step": 2666 }, { "epoch": 0.05336, "grad_norm": 2.6875, "grad_norm_var": 0.026927693684895834, "learning_rate": 0.0001, "loss": 4.8771, "loss/crossentropy": 2.083684980869293, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2632910907268524, "step": 2668 }, { "epoch": 0.0534, "grad_norm": 2.75, "grad_norm_var": 0.028434244791666667, "learning_rate": 0.0001, "loss": 4.8585, "loss/crossentropy": 2.2125936150550842, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.31363190710544586, "step": 2670 }, { "epoch": 0.05344, "grad_norm": 2.734375, "grad_norm_var": 0.029157511393229165, "learning_rate": 0.0001, "loss": 5.0354, "loss/crossentropy": 2.2075263261795044, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2850564122200012, "step": 2672 }, { "epoch": 0.05348, "grad_norm": 2.859375, "grad_norm_var": 0.026656087239583334, "learning_rate": 0.0001, "loss": 5.0145, "loss/crossentropy": 2.0876463651657104, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2972792685031891, "step": 2674 }, { "epoch": 0.05352, "grad_norm": 2.84375, "grad_norm_var": 0.020406087239583332, "learning_rate": 0.0001, "loss": 5.2755, "loss/crossentropy": 2.4033172130584717, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.304116889834404, "step": 2676 }, { "epoch": 0.05356, "grad_norm": 2.734375, "grad_norm_var": 0.014501953125, "learning_rate": 0.0001, "loss": 5.0921, "loss/crossentropy": 2.30586314201355, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3264722675085068, "step": 2678 }, { "epoch": 0.0536, "grad_norm": 2.46875, "grad_norm_var": 0.015152994791666667, "learning_rate": 0.0001, "loss": 5.0941, "loss/crossentropy": 2.2175174951553345, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2661540359258652, "step": 2680 }, { "epoch": 0.05364, "grad_norm": 2.921875, "grad_norm_var": 0.022606404622395833, "learning_rate": 0.0001, "loss": 5.1162, "loss/crossentropy": 2.0583900213241577, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26995618641376495, "step": 2682 }, { "epoch": 0.05368, "grad_norm": 2.59375, "grad_norm_var": 0.020173136393229166, "learning_rate": 0.0001, "loss": 4.9357, "loss/crossentropy": 2.310309052467346, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.29642508924007416, "step": 2684 }, { "epoch": 0.05372, "grad_norm": 3.15625, "grad_norm_var": 0.03332926432291667, "learning_rate": 0.0001, "loss": 4.7945, "loss/crossentropy": 2.134134352207184, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2619224041700363, "step": 2686 }, { "epoch": 0.05376, "grad_norm": 2.609375, "grad_norm_var": 0.03306884765625, "learning_rate": 0.0001, "loss": 4.8411, "loss/crossentropy": 1.930562138557434, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.27108173072338104, "step": 2688 }, { "epoch": 0.0538, "grad_norm": 2.46875, "grad_norm_var": 0.03455403645833333, "learning_rate": 0.0001, "loss": 4.9591, "loss/crossentropy": 2.3414769172668457, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.270420178771019, "step": 2690 }, { "epoch": 0.05384, "grad_norm": 2.78125, "grad_norm_var": 0.0356842041015625, "learning_rate": 0.0001, "loss": 4.83, "loss/crossentropy": 2.1726938486099243, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2710433751344681, "step": 2692 }, { "epoch": 0.05388, "grad_norm": 2.640625, "grad_norm_var": 0.0361968994140625, "learning_rate": 0.0001, "loss": 5.0163, "loss/crossentropy": 2.220117926597595, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3212582617998123, "step": 2694 }, { "epoch": 0.05392, "grad_norm": 2.71875, "grad_norm_var": 0.03297119140625, "learning_rate": 0.0001, "loss": 5.1845, "loss/crossentropy": 2.2557668685913086, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3096832036972046, "step": 2696 }, { "epoch": 0.05396, "grad_norm": 2.734375, "grad_norm_var": 0.0246246337890625, "learning_rate": 0.0001, "loss": 5.0399, "loss/crossentropy": 1.94975346326828, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2530350238084793, "step": 2698 }, { "epoch": 0.054, "grad_norm": 2.6875, "grad_norm_var": 0.024169921875, "learning_rate": 0.0001, "loss": 5.19, "loss/crossentropy": 2.4622775316238403, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3135389983654022, "step": 2700 }, { "epoch": 0.05404, "grad_norm": 2.65625, "grad_norm_var": 0.0110748291015625, "learning_rate": 0.0001, "loss": 5.2005, "loss/crossentropy": 2.5367363691329956, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.30742934346199036, "step": 2702 }, { "epoch": 0.05408, "grad_norm": 2.5, "grad_norm_var": 0.014631144205729167, "learning_rate": 0.0001, "loss": 5.146, "loss/crossentropy": 2.5733184814453125, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.33410580456256866, "step": 2704 }, { "epoch": 0.05412, "grad_norm": 2.6875, "grad_norm_var": 0.011725870768229167, "learning_rate": 0.0001, "loss": 4.8888, "loss/crossentropy": 1.9339997172355652, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.28760699927806854, "step": 2706 }, { "epoch": 0.05416, "grad_norm": 2.484375, "grad_norm_var": 0.010724894205729167, "learning_rate": 0.0001, "loss": 4.8719, "loss/crossentropy": 1.8515672087669373, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23648252338171005, "step": 2708 }, { "epoch": 0.0542, "grad_norm": 2.546875, "grad_norm_var": 0.014420572916666667, "learning_rate": 0.0001, "loss": 4.6598, "loss/crossentropy": 2.0973429083824158, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2605738639831543, "step": 2710 }, { "epoch": 0.05424, "grad_norm": 2.90625, "grad_norm_var": 0.021219889322916668, "learning_rate": 0.0001, "loss": 5.2795, "loss/crossentropy": 2.406570076942444, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2992263287305832, "step": 2712 }, { "epoch": 0.05428, "grad_norm": 2.53125, "grad_norm_var": 0.024958292643229168, "learning_rate": 0.0001, "loss": 4.8591, "loss/crossentropy": 2.040315330028534, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.27726222574710846, "step": 2714 }, { "epoch": 0.05432, "grad_norm": 2.40625, "grad_norm_var": 0.025178019205729166, "learning_rate": 0.0001, "loss": 4.7879, "loss/crossentropy": 2.250051259994507, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2764698565006256, "step": 2716 }, { "epoch": 0.05436, "grad_norm": 2.4375, "grad_norm_var": 0.026985677083333333, "learning_rate": 0.0001, "loss": 4.8813, "loss/crossentropy": 2.25112247467041, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3101722151041031, "step": 2718 }, { "epoch": 0.0544, "grad_norm": 2.421875, "grad_norm_var": 0.028709920247395833, "learning_rate": 0.0001, "loss": 4.7242, "loss/crossentropy": 2.261968731880188, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2805032432079315, "step": 2720 }, { "epoch": 0.05444, "grad_norm": 2.59375, "grad_norm_var": 0.030078125, "learning_rate": 0.0001, "loss": 5.0449, "loss/crossentropy": 2.376634955406189, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25400323420763016, "step": 2722 }, { "epoch": 0.05448, "grad_norm": 2.515625, "grad_norm_var": 0.03351949055989583, "learning_rate": 0.0001, "loss": 5.2325, "loss/crossentropy": 2.61246657371521, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3170415759086609, "step": 2724 }, { "epoch": 0.05452, "grad_norm": 2.59375, "grad_norm_var": 0.0296051025390625, "learning_rate": 0.0001, "loss": 5.0433, "loss/crossentropy": 2.3982752561569214, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2730572074651718, "step": 2726 }, { "epoch": 0.05456, "grad_norm": 2.609375, "grad_norm_var": 0.0207916259765625, "learning_rate": 0.0001, "loss": 4.8836, "loss/crossentropy": 1.9890516996383667, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26453813910484314, "step": 2728 }, { "epoch": 0.0546, "grad_norm": 2.4375, "grad_norm_var": 0.016559855143229166, "learning_rate": 0.0001, "loss": 4.7252, "loss/crossentropy": 2.1825047731399536, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.28369753062725067, "step": 2730 }, { "epoch": 0.05464, "grad_norm": 2.5, "grad_norm_var": 0.015360514322916666, "learning_rate": 0.0001, "loss": 4.9445, "loss/crossentropy": 1.9745987057685852, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2662041634321213, "step": 2732 }, { "epoch": 0.05468, "grad_norm": 2.640625, "grad_norm_var": 0.020783487955729166, "learning_rate": 0.0001, "loss": 4.7591, "loss/crossentropy": 2.2962852716445923, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2986691743135452, "step": 2734 }, { "epoch": 0.05472, "grad_norm": 2.65625, "grad_norm_var": 0.018342081705729166, "learning_rate": 0.0001, "loss": 4.9712, "loss/crossentropy": 2.0517550110816956, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.27293455600738525, "step": 2736 }, { "epoch": 0.05476, "grad_norm": 2.859375, "grad_norm_var": 0.021434529622395834, "learning_rate": 0.0001, "loss": 5.4052, "loss/crossentropy": 2.327734112739563, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3019224554300308, "step": 2738 }, { "epoch": 0.0548, "grad_norm": 2.5, "grad_norm_var": 0.018680826822916666, "learning_rate": 0.0001, "loss": 4.7002, "loss/crossentropy": 2.236689567565918, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2816064953804016, "step": 2740 }, { "epoch": 0.05484, "grad_norm": 2.75, "grad_norm_var": 0.020894368489583332, "learning_rate": 0.0001, "loss": 5.1321, "loss/crossentropy": 2.0209690928459167, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2734442874789238, "step": 2742 }, { "epoch": 0.05488, "grad_norm": 2.640625, "grad_norm_var": 0.026276652018229166, "learning_rate": 0.0001, "loss": 4.9841, "loss/crossentropy": 2.2264864444732666, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.28523482382297516, "step": 2744 }, { "epoch": 0.05492, "grad_norm": 2.421875, "grad_norm_var": 0.024738566080729166, "learning_rate": 0.0001, "loss": 4.8661, "loss/crossentropy": 1.9897980690002441, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26623376458883286, "step": 2746 }, { "epoch": 0.05496, "grad_norm": 2.53125, "grad_norm_var": 0.024442545572916665, "learning_rate": 0.0001, "loss": 4.9299, "loss/crossentropy": 2.0583779215812683, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2803298681974411, "step": 2748 }, { "epoch": 0.055, "grad_norm": 2.515625, "grad_norm_var": 0.022001139322916665, "learning_rate": 0.0001, "loss": 4.9818, "loss/crossentropy": 1.8448269367218018, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2606130689382553, "step": 2750 }, { "epoch": 0.05504, "grad_norm": 2.640625, "grad_norm_var": 0.02301025390625, "learning_rate": 0.0001, "loss": 5.0885, "loss/crossentropy": 2.294836401939392, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.29403699934482574, "step": 2752 }, { "epoch": 0.05508, "grad_norm": 2.296875, "grad_norm_var": 0.02808837890625, "learning_rate": 0.0001, "loss": 4.4637, "loss/crossentropy": 2.2402058839797974, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.28176650404930115, "step": 2754 }, { "epoch": 0.05512, "grad_norm": 2.734375, "grad_norm_var": 0.0298980712890625, "learning_rate": 0.0001, "loss": 4.7805, "loss/crossentropy": 1.7882421612739563, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25275079905986786, "step": 2756 }, { "epoch": 0.05516, "grad_norm": 2.8125, "grad_norm_var": 0.02919921875, "learning_rate": 0.0001, "loss": 5.1277, "loss/crossentropy": 2.4185458421707153, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.30663780868053436, "step": 2758 }, { "epoch": 0.0552, "grad_norm": 2.484375, "grad_norm_var": 0.0318511962890625, "learning_rate": 0.0001, "loss": 4.9259, "loss/crossentropy": 2.2588642835617065, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26785464584827423, "step": 2760 }, { "epoch": 0.05524, "grad_norm": 2.484375, "grad_norm_var": 0.034012858072916666, "learning_rate": 0.0001, "loss": 4.8359, "loss/crossentropy": 2.145754337310791, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2710302472114563, "step": 2762 }, { "epoch": 0.05528, "grad_norm": 2.609375, "grad_norm_var": 0.03648681640625, "learning_rate": 0.0001, "loss": 4.739, "loss/crossentropy": 2.3627569675445557, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2926081120967865, "step": 2764 }, { "epoch": 0.05532, "grad_norm": 2.609375, "grad_norm_var": 0.03632405598958333, "learning_rate": 0.0001, "loss": 4.8512, "loss/crossentropy": 1.988103210926056, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24308288842439651, "step": 2766 }, { "epoch": 0.05536, "grad_norm": 2.828125, "grad_norm_var": 0.040445963541666664, "learning_rate": 0.0001, "loss": 5.1075, "loss/crossentropy": 2.2497689723968506, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.313574954867363, "step": 2768 }, { "epoch": 0.0554, "grad_norm": 2.609375, "grad_norm_var": 0.030492146809895832, "learning_rate": 0.0001, "loss": 4.8978, "loss/crossentropy": 2.2603683471679688, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.27997657656669617, "step": 2770 }, { "epoch": 0.05544, "grad_norm": 2.796875, "grad_norm_var": 0.030614217122395832, "learning_rate": 0.0001, "loss": 4.9485, "loss/crossentropy": 2.2585065364837646, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2818940281867981, "step": 2772 }, { "epoch": 0.05548, "grad_norm": 2.578125, "grad_norm_var": 0.027730305989583332, "learning_rate": 0.0001, "loss": 5.2222, "loss/crossentropy": 2.1413429975509644, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2950669527053833, "step": 2774 }, { "epoch": 0.05552, "grad_norm": 2.734375, "grad_norm_var": 0.0171875, "learning_rate": 0.0001, "loss": 5.1567, "loss/crossentropy": 1.9994583129882812, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3946940451860428, "step": 2776 }, { "epoch": 0.05556, "grad_norm": 2.796875, "grad_norm_var": 0.014762369791666667, "learning_rate": 0.0001, "loss": 5.2877, "loss/crossentropy": 2.423824667930603, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.30817069113254547, "step": 2778 }, { "epoch": 0.0556, "grad_norm": 2.4375, "grad_norm_var": 0.013719685872395833, "learning_rate": 0.0001, "loss": 4.6657, "loss/crossentropy": 1.8579126000404358, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24312064796686172, "step": 2780 }, { "epoch": 0.05564, "grad_norm": 2.390625, "grad_norm_var": 0.018485514322916667, "learning_rate": 0.0001, "loss": 4.7653, "loss/crossentropy": 2.3444939851760864, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.28466545045375824, "step": 2782 }, { "epoch": 0.05568, "grad_norm": 2.71875, "grad_norm_var": 0.03806864420572917, "learning_rate": 0.0001, "loss": 5.1187, "loss/crossentropy": 2.221144914627075, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2694521099328995, "step": 2784 }, { "epoch": 0.05572, "grad_norm": 2.578125, "grad_norm_var": 0.0376861572265625, "learning_rate": 0.0001, "loss": 5.0401, "loss/crossentropy": 1.9372909665107727, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2698594778776169, "step": 2786 }, { "epoch": 0.05576, "grad_norm": 2.765625, "grad_norm_var": 0.038834635416666666, "learning_rate": 0.0001, "loss": 4.834, "loss/crossentropy": 2.129204750061035, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.27333614230155945, "step": 2788 }, { "epoch": 0.0558, "grad_norm": 2.5, "grad_norm_var": 0.04045817057291667, "learning_rate": 0.0001, "loss": 4.8462, "loss/crossentropy": 1.6917370557785034, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24290545284748077, "step": 2790 }, { "epoch": 0.05584, "grad_norm": 2.765625, "grad_norm_var": 0.04096577962239583, "learning_rate": 0.0001, "loss": 4.6942, "loss/crossentropy": 1.7883749604225159, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2554834187030792, "step": 2792 }, { "epoch": 0.05588, "grad_norm": 2.765625, "grad_norm_var": 0.04560445149739583, "learning_rate": 0.0001, "loss": 4.6835, "loss/crossentropy": 1.867136001586914, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2667583078145981, "step": 2794 }, { "epoch": 0.05592, "grad_norm": 2.546875, "grad_norm_var": 0.04143473307291667, "learning_rate": 0.0001, "loss": 4.9686, "loss/crossentropy": 2.032800853252411, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2580890506505966, "step": 2796 }, { "epoch": 0.05596, "grad_norm": 3.03125, "grad_norm_var": 0.04389546712239583, "learning_rate": 0.0001, "loss": 5.018, "loss/crossentropy": 1.9867863655090332, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2694792151451111, "step": 2798 }, { "epoch": 0.056, "grad_norm": 2.59375, "grad_norm_var": 0.026786295572916667, "learning_rate": 0.0001, "loss": 4.9137, "loss/crossentropy": 2.1026757955551147, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2664051130414009, "step": 2800 }, { "epoch": 0.05604, "grad_norm": 2.65625, "grad_norm_var": 0.0259918212890625, "learning_rate": 0.0001, "loss": 4.9883, "loss/crossentropy": 2.0649160742759705, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2473129704594612, "step": 2802 }, { "epoch": 0.05608, "grad_norm": 2.921875, "grad_norm_var": 0.030451456705729168, "learning_rate": 0.0001, "loss": 4.8959, "loss/crossentropy": 2.2110248804092407, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.27131715416908264, "step": 2804 }, { "epoch": 0.05612, "grad_norm": 2.65625, "grad_norm_var": 0.032933553059895836, "learning_rate": 0.0001, "loss": 4.9603, "loss/crossentropy": 2.3483060598373413, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.27071496844291687, "step": 2806 }, { "epoch": 0.05616, "grad_norm": 2.640625, "grad_norm_var": 0.03277587890625, "learning_rate": 0.0001, "loss": 4.9273, "loss/crossentropy": 1.9061944484710693, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2550426125526428, "step": 2808 }, { "epoch": 0.0562, "grad_norm": 2.671875, "grad_norm_var": 0.0278472900390625, "learning_rate": 0.0001, "loss": 5.1734, "loss/crossentropy": 2.3073103427886963, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.30051568150520325, "step": 2810 }, { "epoch": 0.05624, "grad_norm": 2.671875, "grad_norm_var": 0.027106730143229167, "learning_rate": 0.0001, "loss": 5.0268, "loss/crossentropy": 2.393891453742981, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.29226459562778473, "step": 2812 }, { "epoch": 0.05628, "grad_norm": 2.484375, "grad_norm_var": 0.018578084309895833, "learning_rate": 0.0001, "loss": 5.0839, "loss/crossentropy": 2.3082761764526367, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.29539754986763, "step": 2814 }, { "epoch": 0.05632, "grad_norm": 2.515625, "grad_norm_var": 0.021776326497395835, "learning_rate": 0.0001, "loss": 4.8973, "loss/crossentropy": 2.7815494537353516, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.33258646726608276, "step": 2816 }, { "epoch": 0.05636, "grad_norm": 2.390625, "grad_norm_var": 0.031538899739583334, "learning_rate": 0.0001, "loss": 4.7095, "loss/crossentropy": 2.077984571456909, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.261022225022316, "step": 2818 }, { "epoch": 0.0564, "grad_norm": 2.484375, "grad_norm_var": 0.0251617431640625, "learning_rate": 0.0001, "loss": 4.6389, "loss/crossentropy": 2.0524688363075256, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2936979830265045, "step": 2820 }, { "epoch": 0.05644, "grad_norm": 2.5, "grad_norm_var": 0.018212890625, "learning_rate": 0.0001, "loss": 4.9657, "loss/crossentropy": 1.8323140740394592, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26426824927330017, "step": 2822 }, { "epoch": 0.05648, "grad_norm": 2.765625, "grad_norm_var": 0.0201171875, "learning_rate": 0.0001, "loss": 4.9513, "loss/crossentropy": 2.2290462255477905, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.28852027654647827, "step": 2824 }, { "epoch": 0.05652, "grad_norm": 2.5625, "grad_norm_var": 0.016600545247395834, "learning_rate": 0.0001, "loss": 4.7828, "loss/crossentropy": 1.8788678050041199, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24312467128038406, "step": 2826 }, { "epoch": 0.05656, "grad_norm": 2.46875, "grad_norm_var": 0.01646728515625, "learning_rate": 0.0001, "loss": 4.7882, "loss/crossentropy": 1.9402090311050415, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25435876101255417, "step": 2828 }, { "epoch": 0.0566, "grad_norm": 2.390625, "grad_norm_var": 0.0144683837890625, "learning_rate": 0.0001, "loss": 4.5823, "loss/crossentropy": 2.4833847284317017, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.27234241366386414, "step": 2830 }, { "epoch": 0.05664, "grad_norm": 2.515625, "grad_norm_var": 0.0154296875, "learning_rate": 0.0001, "loss": 5.0521, "loss/crossentropy": 2.351561665534973, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.28028184175491333, "step": 2832 }, { "epoch": 0.05668, "grad_norm": 2.953125, "grad_norm_var": 0.019722493489583333, "learning_rate": 0.0001, "loss": 5.0688, "loss/crossentropy": 2.0372352600097656, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26005299389362335, "step": 2834 }, { "epoch": 0.05672, "grad_norm": 2.828125, "grad_norm_var": 0.021744791666666666, "learning_rate": 0.0001, "loss": 4.4282, "loss/crossentropy": 1.6522082090377808, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26683834940195084, "step": 2836 }, { "epoch": 0.05676, "grad_norm": 2.390625, "grad_norm_var": 0.023778279622395832, "learning_rate": 0.0001, "loss": 4.7165, "loss/crossentropy": 1.8886643052101135, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23464814573526382, "step": 2838 }, { "epoch": 0.0568, "grad_norm": 2.734375, "grad_norm_var": 0.023119099934895835, "learning_rate": 0.0001, "loss": 4.9477, "loss/crossentropy": 2.2161459922790527, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.29381541907787323, "step": 2840 }, { "epoch": 0.05684, "grad_norm": 2.421875, "grad_norm_var": 0.024779256184895834, "learning_rate": 0.0001, "loss": 4.7408, "loss/crossentropy": 2.1542043685913086, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2753119468688965, "step": 2842 }, { "epoch": 0.05688, "grad_norm": 2.78125, "grad_norm_var": 0.026090494791666665, "learning_rate": 0.0001, "loss": 5.1288, "loss/crossentropy": 2.5605628490448, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.329460546374321, "step": 2844 }, { "epoch": 0.05692, "grad_norm": 2.609375, "grad_norm_var": 0.023224894205729166, "learning_rate": 0.0001, "loss": 4.9955, "loss/crossentropy": 2.1000319719314575, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.273986279964447, "step": 2846 }, { "epoch": 0.05696, "grad_norm": 2.484375, "grad_norm_var": 0.023558553059895834, "learning_rate": 0.0001, "loss": 4.6813, "loss/crossentropy": 2.1036760210990906, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26293954253196716, "step": 2848 }, { "epoch": 0.057, "grad_norm": 2.640625, "grad_norm_var": 0.014286295572916666, "learning_rate": 0.0001, "loss": 4.9799, "loss/crossentropy": 2.2130608558654785, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2977828085422516, "step": 2850 }, { "epoch": 0.05704, "grad_norm": 2.578125, "grad_norm_var": 0.01461181640625, "learning_rate": 0.0001, "loss": 5.1761, "loss/crossentropy": 2.1878823041915894, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.27322643995285034, "step": 2852 }, { "epoch": 0.05708, "grad_norm": 2.703125, "grad_norm_var": 0.012547810872395834, "learning_rate": 0.0001, "loss": 4.9448, "loss/crossentropy": 2.1559258699417114, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.32625503838062286, "step": 2854 }, { "epoch": 0.05712, "grad_norm": 2.390625, "grad_norm_var": 0.015848795572916668, "learning_rate": 0.0001, "loss": 4.4611, "loss/crossentropy": 2.0818992257118225, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2677721679210663, "step": 2856 }, { "epoch": 0.05716, "grad_norm": 2.578125, "grad_norm_var": 0.016243489583333333, "learning_rate": 0.0001, "loss": 5.2755, "loss/crossentropy": 2.544907331466675, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.30028045177459717, "step": 2858 }, { "epoch": 0.0572, "grad_norm": 3.203125, "grad_norm_var": 0.04491780598958333, "learning_rate": 0.0001, "loss": 5.2855, "loss/crossentropy": 2.5932188034057617, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.30431173741817474, "step": 2860 }, { "epoch": 0.05724, "grad_norm": 2.453125, "grad_norm_var": 0.046219889322916666, "learning_rate": 0.0001, "loss": 4.9799, "loss/crossentropy": 2.1007986068725586, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2989690601825714, "step": 2862 }, { "epoch": 0.05728, "grad_norm": 2.5625, "grad_norm_var": 0.044169108072916664, "learning_rate": 0.0001, "loss": 4.7706, "loss/crossentropy": 2.0102875232696533, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26936179399490356, "step": 2864 }, { "epoch": 0.05732, "grad_norm": 3.109375, "grad_norm_var": 0.056559244791666664, "learning_rate": 0.0001, "loss": 5.2099, "loss/crossentropy": 2.3457159996032715, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.29194609820842743, "step": 2866 }, { "epoch": 0.05736, "grad_norm": 2.625, "grad_norm_var": 0.055582682291666664, "learning_rate": 0.0001, "loss": 4.8768, "loss/crossentropy": 2.559054732322693, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3018783777952194, "step": 2868 }, { "epoch": 0.0574, "grad_norm": 2.703125, "grad_norm_var": 0.05831705729166667, "learning_rate": 0.0001, "loss": 4.9863, "loss/crossentropy": 2.0641059279441833, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2789234220981598, "step": 2870 }, { "epoch": 0.05744, "grad_norm": 2.734375, "grad_norm_var": 0.04683837890625, "learning_rate": 0.0001, "loss": 4.9222, "loss/crossentropy": 2.0819749236106873, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26825186610221863, "step": 2872 }, { "epoch": 0.05748, "grad_norm": 2.703125, "grad_norm_var": 0.046263631184895834, "learning_rate": 0.0001, "loss": 4.9586, "loss/crossentropy": 2.1114020347595215, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2613547742366791, "step": 2874 }, { "epoch": 0.05752, "grad_norm": 2.4375, "grad_norm_var": 0.02916259765625, "learning_rate": 0.0001, "loss": 4.9474, "loss/crossentropy": 2.174915075302124, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.29503974318504333, "step": 2876 }, { "epoch": 0.05756, "grad_norm": 2.6875, "grad_norm_var": 0.0265289306640625, "learning_rate": 0.0001, "loss": 5.0541, "loss/crossentropy": 2.4342113733291626, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.30623678863048553, "step": 2878 }, { "epoch": 0.0576, "grad_norm": 2.703125, "grad_norm_var": 0.024918619791666666, "learning_rate": 0.0001, "loss": 5.0316, "loss/crossentropy": 2.0518307089805603, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.28422991931438446, "step": 2880 }, { "epoch": 0.05764, "grad_norm": 2.6875, "grad_norm_var": 0.011921183268229166, "learning_rate": 0.0001, "loss": 5.0806, "loss/crossentropy": 2.5378612279891968, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.29730531573295593, "step": 2882 }, { "epoch": 0.05768, "grad_norm": 2.5, "grad_norm_var": 0.014972941080729166, "learning_rate": 0.0001, "loss": 4.8685, "loss/crossentropy": 2.2670027017593384, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26347288489341736, "step": 2884 }, { "epoch": 0.05772, "grad_norm": 3.671875, "grad_norm_var": 0.07822265625, "learning_rate": 0.0001, "loss": 5.0385, "loss/crossentropy": 2.351823568344116, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3020637035369873, "step": 2886 }, { "epoch": 0.05776, "grad_norm": 2.65625, "grad_norm_var": 0.0792633056640625, "learning_rate": 0.0001, "loss": 4.9699, "loss/crossentropy": 2.190839111804962, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2639586254954338, "step": 2888 }, { "epoch": 0.0578, "grad_norm": 2.3125, "grad_norm_var": 0.08765869140625, "learning_rate": 0.0001, "loss": 4.6882, "loss/crossentropy": 2.148400902748108, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.27762140333652496, "step": 2890 }, { "epoch": 0.05784, "grad_norm": 2.984375, "grad_norm_var": 0.09696858723958333, "learning_rate": 0.0001, "loss": 5.1033, "loss/crossentropy": 2.01130074262619, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2678648605942726, "step": 2892 }, { "epoch": 0.05788, "grad_norm": 2.75, "grad_norm_var": 0.09986572265625, "learning_rate": 0.0001, "loss": 5.0829, "loss/crossentropy": 2.2239269018173218, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2971164286136627, "step": 2894 }, { "epoch": 0.05792, "grad_norm": 2.328125, "grad_norm_var": 0.10974934895833334, "learning_rate": 0.0001, "loss": 4.48, "loss/crossentropy": 1.9573850631713867, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.241651751101017, "step": 2896 }, { "epoch": 0.05796, "grad_norm": 2.5, "grad_norm_var": 0.11169331868489583, "learning_rate": 0.0001, "loss": 4.8806, "loss/crossentropy": 1.9522782564163208, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2685594707727432, "step": 2898 }, { "epoch": 0.058, "grad_norm": 2.671875, "grad_norm_var": 0.10886128743489583, "learning_rate": 0.0001, "loss": 4.9335, "loss/crossentropy": 2.1742069721221924, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.27867090702056885, "step": 2900 }, { "epoch": 0.05804, "grad_norm": 5.15625, "grad_norm_var": 0.44882405598958336, "learning_rate": 0.0001, "loss": 4.9164, "loss/crossentropy": 2.201782703399658, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2731190174818039, "step": 2902 }, { "epoch": 0.05808, "grad_norm": 2.734375, "grad_norm_var": 0.4452301025390625, "learning_rate": 0.0001, "loss": 4.9787, "loss/crossentropy": 2.159709095954895, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2940017879009247, "step": 2904 }, { "epoch": 0.05812, "grad_norm": 2.71875, "grad_norm_var": 0.4297271728515625, "learning_rate": 0.0001, "loss": 4.9192, "loss/crossentropy": 2.1989885568618774, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2776087671518326, "step": 2906 }, { "epoch": 0.05816, "grad_norm": 2.546875, "grad_norm_var": 0.42688395182291666, "learning_rate": 0.0001, "loss": 4.8851, "loss/crossentropy": 1.9634575247764587, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.27917809784412384, "step": 2908 }, { "epoch": 0.0582, "grad_norm": 2.390625, "grad_norm_var": 0.43651936848958334, "learning_rate": 0.0001, "loss": 4.7776, "loss/crossentropy": 2.1553479433059692, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.28876082599163055, "step": 2910 }, { "epoch": 0.05824, "grad_norm": 2.515625, "grad_norm_var": 0.42451883951822916, "learning_rate": 0.0001, "loss": 4.881, "loss/crossentropy": 2.1035598516464233, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2851293236017227, "step": 2912 }, { "epoch": 0.05828, "grad_norm": 2.890625, "grad_norm_var": 0.42097880045572916, "learning_rate": 0.0001, "loss": 5.2733, "loss/crossentropy": 2.17076575756073, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.28258734941482544, "step": 2914 }, { "epoch": 0.05832, "grad_norm": 2.484375, "grad_norm_var": 0.42477213541666664, "learning_rate": 0.0001, "loss": 4.9147, "loss/crossentropy": 2.2914711236953735, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.29451698064804077, "step": 2916 }, { "epoch": 0.05836, "grad_norm": 2.703125, "grad_norm_var": 0.021410115559895835, "learning_rate": 0.0001, "loss": 5.1395, "loss/crossentropy": 2.554638981819153, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.30992935597896576, "step": 2918 }, { "epoch": 0.0584, "grad_norm": 2.546875, "grad_norm_var": 0.05735575358072917, "learning_rate": 0.0001, "loss": 4.6262, "loss/crossentropy": 1.808376431465149, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22574126720428467, "step": 2920 }, { "epoch": 0.05844, "grad_norm": 2.78125, "grad_norm_var": 0.05660400390625, "learning_rate": 0.0001, "loss": 5.1636, "loss/crossentropy": 2.1471784114837646, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.27325738966464996, "step": 2922 }, { "epoch": 0.05848, "grad_norm": 2.765625, "grad_norm_var": 0.054182942708333334, "learning_rate": 0.0001, "loss": 5.071, "loss/crossentropy": 2.2175731658935547, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3013547956943512, "step": 2924 }, { "epoch": 0.05852, "grad_norm": 2.953125, "grad_norm_var": 0.04949544270833333, "learning_rate": 0.0001, "loss": 5.467, "loss/crossentropy": 2.369373917579651, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.29249751567840576, "step": 2926 }, { "epoch": 0.05856, "grad_norm": 2.453125, "grad_norm_var": 0.054915364583333334, "learning_rate": 0.0001, "loss": 4.8778, "loss/crossentropy": 2.1758522987365723, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.28846102952957153, "step": 2928 }, { "epoch": 0.0586, "grad_norm": 2.546875, "grad_norm_var": 0.05413411458333333, "learning_rate": 0.0001, "loss": 4.929, "loss/crossentropy": 2.46218478679657, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2824682295322418, "step": 2930 }, { "epoch": 0.05864, "grad_norm": 2.40625, "grad_norm_var": 0.0606597900390625, "learning_rate": 0.0001, "loss": 4.5557, "loss/crossentropy": 2.058937907218933, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26767465472221375, "step": 2932 }, { "epoch": 0.05868, "grad_norm": 2.546875, "grad_norm_var": 0.06109619140625, "learning_rate": 0.0001, "loss": 4.9103, "loss/crossentropy": 2.312442421913147, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2952658236026764, "step": 2934 }, { "epoch": 0.05872, "grad_norm": 2.71875, "grad_norm_var": 0.026911417643229168, "learning_rate": 0.0001, "loss": 5.1452, "loss/crossentropy": 2.18839955329895, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.31788623332977295, "step": 2936 }, { "epoch": 0.05876, "grad_norm": 2.671875, "grad_norm_var": 0.025537109375, "learning_rate": 0.0001, "loss": 5.1358, "loss/crossentropy": 2.1330811977386475, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.28829698264598846, "step": 2938 }, { "epoch": 0.0588, "grad_norm": 2.9375, "grad_norm_var": 0.0294586181640625, "learning_rate": 0.0001, "loss": 5.1071, "loss/crossentropy": 2.124038338661194, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2708826810121536, "step": 2940 }, { "epoch": 0.05884, "grad_norm": 2.53125, "grad_norm_var": 0.024312337239583332, "learning_rate": 0.0001, "loss": 4.8479, "loss/crossentropy": 2.1934449076652527, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.27314090728759766, "step": 2942 }, { "epoch": 0.05888, "grad_norm": 2.6875, "grad_norm_var": 0.024535115559895834, "learning_rate": 0.0001, "loss": 4.9904, "loss/crossentropy": 1.967636525630951, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23761005699634552, "step": 2944 }, { "epoch": 0.05892, "grad_norm": 3.015625, "grad_norm_var": 0.034821573893229166, "learning_rate": 0.0001, "loss": 4.9214, "loss/crossentropy": 2.2380826473236084, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3402934819459915, "step": 2946 }, { "epoch": 0.05896, "grad_norm": 2.5625, "grad_norm_var": 0.0299468994140625, "learning_rate": 0.0001, "loss": 4.7702, "loss/crossentropy": 2.227039933204651, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.27934837341308594, "step": 2948 }, { "epoch": 0.059, "grad_norm": 2.6875, "grad_norm_var": 0.03258056640625, "learning_rate": 0.0001, "loss": 4.789, "loss/crossentropy": 1.999170958995819, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.275806725025177, "step": 2950 }, { "epoch": 0.05904, "grad_norm": 2.546875, "grad_norm_var": 0.03241780598958333, "learning_rate": 0.0001, "loss": 4.5754, "loss/crossentropy": 1.8843520879745483, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26203446090221405, "step": 2952 }, { "epoch": 0.05908, "grad_norm": 2.5, "grad_norm_var": 0.033426920572916664, "learning_rate": 0.0001, "loss": 4.5485, "loss/crossentropy": 2.0742241740226746, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.28777699172496796, "step": 2954 }, { "epoch": 0.05912, "grad_norm": 2.6875, "grad_norm_var": 0.028206380208333333, "learning_rate": 0.0001, "loss": 4.9676, "loss/crossentropy": 2.2272751331329346, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.27761510014533997, "step": 2956 }, { "epoch": 0.05916, "grad_norm": 2.734375, "grad_norm_var": 0.029878743489583335, "learning_rate": 0.0001, "loss": 4.6329, "loss/crossentropy": 2.2758638858795166, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.28144824504852295, "step": 2958 }, { "epoch": 0.0592, "grad_norm": 2.46875, "grad_norm_var": 0.060530598958333334, "learning_rate": 0.0001, "loss": 4.9543, "loss/crossentropy": 2.245158016681671, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3030128926038742, "step": 2960 }, { "epoch": 0.05924, "grad_norm": 2.453125, "grad_norm_var": 0.053498331705729166, "learning_rate": 0.0001, "loss": 4.567, "loss/crossentropy": 2.3337708711624146, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.29228493571281433, "step": 2962 }, { "epoch": 0.05928, "grad_norm": 2.5, "grad_norm_var": 0.05788472493489583, "learning_rate": 0.0001, "loss": 4.6477, "loss/crossentropy": 2.3765406608581543, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2715871036052704, "step": 2964 }, { "epoch": 0.05932, "grad_norm": 2.375, "grad_norm_var": 0.0562652587890625, "learning_rate": 0.0001, "loss": 4.8649, "loss/crossentropy": 2.090362787246704, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2505037933588028, "step": 2966 }, { "epoch": 0.05936, "grad_norm": 2.453125, "grad_norm_var": 0.056396484375, "learning_rate": 0.0001, "loss": 4.8669, "loss/crossentropy": 2.011539399623871, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.27219071984291077, "step": 2968 }, { "epoch": 0.0594, "grad_norm": 2.734375, "grad_norm_var": 0.05693359375, "learning_rate": 0.0001, "loss": 5.2081, "loss/crossentropy": 2.1791563034057617, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2796429842710495, "step": 2970 }, { "epoch": 0.05944, "grad_norm": 2.65625, "grad_norm_var": 0.05869852701822917, "learning_rate": 0.0001, "loss": 5.2689, "loss/crossentropy": 2.4645297527313232, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3106658458709717, "step": 2972 }, { "epoch": 0.05948, "grad_norm": 7.0, "grad_norm_var": 1.25670166015625, "learning_rate": 0.0001, "loss": 5.0715, "loss/crossentropy": 2.2050880193710327, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26860976219177246, "step": 2974 }, { "epoch": 0.05952, "grad_norm": 2.828125, "grad_norm_var": 1.236034138997396, "learning_rate": 0.0001, "loss": 4.5815, "loss/crossentropy": 2.0141921639442444, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.27980539202690125, "step": 2976 }, { "epoch": 0.05956, "grad_norm": 4.4375, "grad_norm_var": 1.3665924072265625, "learning_rate": 0.0001, "loss": 4.8067, "loss/crossentropy": 1.9399088025093079, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26379524916410446, "step": 2978 }, { "epoch": 0.0596, "grad_norm": 2.796875, "grad_norm_var": 1.3243886311848958, "learning_rate": 0.0001, "loss": 5.1743, "loss/crossentropy": 2.5415157079696655, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.291620597243309, "step": 2980 }, { "epoch": 0.05964, "grad_norm": 2.5625, "grad_norm_var": 1.3170562744140626, "learning_rate": 0.0001, "loss": 5.0695, "loss/crossentropy": 2.1215542554855347, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26189403235912323, "step": 2982 }, { "epoch": 0.05968, "grad_norm": 2.578125, "grad_norm_var": 1.32301025390625, "learning_rate": 0.0001, "loss": 5.0049, "loss/crossentropy": 1.7749422788619995, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.27546317875385284, "step": 2984 }, { "epoch": 0.05972, "grad_norm": 2.59375, "grad_norm_var": 1.3371490478515624, "learning_rate": 0.0001, "loss": 4.8331, "loss/crossentropy": 2.3383208513259888, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2790983319282532, "step": 2986 }, { "epoch": 0.05976, "grad_norm": 2.5625, "grad_norm_var": 1.35230712890625, "learning_rate": 0.0001, "loss": 4.8656, "loss/crossentropy": 2.3688780069351196, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2752939611673355, "step": 2988 }, { "epoch": 0.0598, "grad_norm": 2.6875, "grad_norm_var": 0.22967020670572916, "learning_rate": 0.0001, "loss": 4.7299, "loss/crossentropy": 2.261468529701233, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2725592106580734, "step": 2990 }, { "epoch": 0.05984, "grad_norm": 2.671875, "grad_norm_var": 0.2226226806640625, "learning_rate": 0.0001, "loss": 4.7417, "loss/crossentropy": 2.0632028579711914, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2617062032222748, "step": 2992 }, { "epoch": 0.05988, "grad_norm": 2.25, "grad_norm_var": 0.022151692708333334, "learning_rate": 0.0001, "loss": 4.4282, "loss/crossentropy": 2.1409813165664673, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2657589614391327, "step": 2994 }, { "epoch": 0.05992, "grad_norm": 2.921875, "grad_norm_var": 0.0258941650390625, "learning_rate": 0.0001, "loss": 5.1816, "loss/crossentropy": 2.431404948234558, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3087555915117264, "step": 2996 }, { "epoch": 0.05996, "grad_norm": 2.390625, "grad_norm_var": 0.029743448893229166, "learning_rate": 0.0001, "loss": 4.8253, "loss/crossentropy": 2.165238618850708, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.29382775723934174, "step": 2998 }, { "epoch": 0.06, "grad_norm": 2.546875, "grad_norm_var": 0.034566243489583336, "learning_rate": 0.0001, "loss": 4.6328, "loss/crossentropy": 1.9987847208976746, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24466252326965332, "step": 3000 }, { "epoch": 0.06004, "grad_norm": 2.203125, "grad_norm_var": 0.04084370930989583, "learning_rate": 0.0001, "loss": 4.5474, "loss/crossentropy": 2.1153565049171448, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2435970976948738, "step": 3002 }, { "epoch": 0.06008, "grad_norm": 2.3125, "grad_norm_var": 0.046402994791666666, "learning_rate": 0.0001, "loss": 4.7765, "loss/crossentropy": 1.8660866618156433, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24413185566663742, "step": 3004 }, { "epoch": 0.06012, "grad_norm": 2.40625, "grad_norm_var": 0.04309794108072917, "learning_rate": 0.0001, "loss": 4.7476, "loss/crossentropy": 2.1816678047180176, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.28010787069797516, "step": 3006 }, { "epoch": 0.06016, "grad_norm": 2.53125, "grad_norm_var": 0.040827433268229164, "learning_rate": 0.0001, "loss": 4.5995, "loss/crossentropy": 2.1673622131347656, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2659531533718109, "step": 3008 }, { "epoch": 0.0602, "grad_norm": 2.78125, "grad_norm_var": 0.040913899739583336, "learning_rate": 0.0001, "loss": 5.2466, "loss/crossentropy": 2.277818202972412, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2929365783929825, "step": 3010 }, { "epoch": 0.06024, "grad_norm": 2.65625, "grad_norm_var": 0.0284576416015625, "learning_rate": 0.0001, "loss": 4.8904, "loss/crossentropy": 2.294926404953003, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2753874659538269, "step": 3012 }, { "epoch": 0.06028, "grad_norm": 2.359375, "grad_norm_var": 0.028173828125, "learning_rate": 0.0001, "loss": 4.735, "loss/crossentropy": 2.0222257375717163, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.27295801043510437, "step": 3014 }, { "epoch": 0.06032, "grad_norm": 2.484375, "grad_norm_var": 0.023628743489583333, "learning_rate": 0.0001, "loss": 4.818, "loss/crossentropy": 2.43736469745636, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2934436649084091, "step": 3016 }, { "epoch": 0.06036, "grad_norm": 2.703125, "grad_norm_var": 0.020426432291666668, "learning_rate": 0.0001, "loss": 5.0053, "loss/crossentropy": 2.027767241001129, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26353244483470917, "step": 3018 }, { "epoch": 0.0604, "grad_norm": 2.59375, "grad_norm_var": 0.015034993489583334, "learning_rate": 0.0001, "loss": 4.9059, "loss/crossentropy": 2.150290071964264, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.27290941774845123, "step": 3020 }, { "epoch": 0.06044, "grad_norm": 2.421875, "grad_norm_var": 0.014697265625, "learning_rate": 0.0001, "loss": 4.8307, "loss/crossentropy": 1.8709848523139954, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25272610783576965, "step": 3022 }, { "epoch": 0.06048, "grad_norm": 2.46875, "grad_norm_var": 0.015949503580729166, "learning_rate": 0.0001, "loss": 5.0166, "loss/crossentropy": 2.2170883417129517, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.29401010274887085, "step": 3024 }, { "epoch": 0.06052, "grad_norm": 2.53125, "grad_norm_var": 0.0109375, "learning_rate": 0.0001, "loss": 4.6221, "loss/crossentropy": 1.728318691253662, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24246910959482193, "step": 3026 }, { "epoch": 0.06056, "grad_norm": 2.34375, "grad_norm_var": 0.0095123291015625, "learning_rate": 0.0001, "loss": 4.5597, "loss/crossentropy": 1.8453290462493896, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24357211589813232, "step": 3028 }, { "epoch": 0.0606, "grad_norm": 2.640625, "grad_norm_var": 0.010456339518229166, "learning_rate": 0.0001, "loss": 4.6617, "loss/crossentropy": 2.273196220397949, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.29239149391651154, "step": 3030 }, { "epoch": 0.06064, "grad_norm": 2.484375, "grad_norm_var": 0.012165323893229166, "learning_rate": 0.0001, "loss": 4.9464, "loss/crossentropy": 2.031871259212494, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2604901194572449, "step": 3032 }, { "epoch": 0.06068, "grad_norm": 2.515625, "grad_norm_var": 0.008915201822916666, "learning_rate": 0.0001, "loss": 5.1618, "loss/crossentropy": 2.1781771183013916, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2853371948003769, "step": 3034 }, { "epoch": 0.06072, "grad_norm": 2.484375, "grad_norm_var": 0.006590779622395833, "learning_rate": 0.0001, "loss": 4.816, "loss/crossentropy": 1.9578353762626648, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23352904617786407, "step": 3036 }, { "epoch": 0.06076, "grad_norm": 2.671875, "grad_norm_var": 0.012516276041666666, "learning_rate": 0.0001, "loss": 5.3012, "loss/crossentropy": 2.470985770225525, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.28890977799892426, "step": 3038 }, { "epoch": 0.0608, "grad_norm": 2.65625, "grad_norm_var": 0.013361612955729166, "learning_rate": 0.0001, "loss": 4.9243, "loss/crossentropy": 2.056231141090393, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2963344603776932, "step": 3040 }, { "epoch": 0.06084, "grad_norm": 2.609375, "grad_norm_var": 0.013646443684895834, "learning_rate": 0.0001, "loss": 4.8612, "loss/crossentropy": 2.000037968158722, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.27152783423662186, "step": 3042 }, { "epoch": 0.06088, "grad_norm": 2.59375, "grad_norm_var": 0.0111480712890625, "learning_rate": 0.0001, "loss": 4.8576, "loss/crossentropy": 2.323809027671814, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.29995349049568176, "step": 3044 }, { "epoch": 0.06092, "grad_norm": 2.4375, "grad_norm_var": 0.010407511393229167, "learning_rate": 0.0001, "loss": 5.0094, "loss/crossentropy": 2.06933856010437, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26831966638565063, "step": 3046 }, { "epoch": 0.06096, "grad_norm": 2.53125, "grad_norm_var": 0.008463541666666666, "learning_rate": 0.0001, "loss": 5.042, "loss/crossentropy": 2.1903880834579468, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2833031117916107, "step": 3048 }, { "epoch": 0.061, "grad_norm": 2.46875, "grad_norm_var": 0.008893839518229167, "learning_rate": 0.0001, "loss": 4.9557, "loss/crossentropy": 1.910680890083313, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2708408683538437, "step": 3050 }, { "epoch": 0.06104, "grad_norm": 2.546875, "grad_norm_var": 0.0083404541015625, "learning_rate": 0.0001, "loss": 4.8902, "loss/crossentropy": 2.5203059911727905, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2981104403734207, "step": 3052 }, { "epoch": 0.06108, "grad_norm": 2.5, "grad_norm_var": 0.00611572265625, "learning_rate": 0.0001, "loss": 4.68, "loss/crossentropy": 1.7918179035186768, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2462354451417923, "step": 3054 }, { "epoch": 0.06112, "grad_norm": 2.65625, "grad_norm_var": 0.006441243489583333, "learning_rate": 0.0001, "loss": 4.9908, "loss/crossentropy": 2.185121774673462, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3046490252017975, "step": 3056 }, { "epoch": 0.06116, "grad_norm": 2.28125, "grad_norm_var": 0.01064453125, "learning_rate": 0.0001, "loss": 4.8041, "loss/crossentropy": 1.859390914440155, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2523205131292343, "step": 3058 }, { "epoch": 0.0612, "grad_norm": 2.40625, "grad_norm_var": 0.011930338541666667, "learning_rate": 0.0001, "loss": 4.7388, "loss/crossentropy": 1.9202255606651306, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26971636712551117, "step": 3060 }, { "epoch": 0.06124, "grad_norm": 2.578125, "grad_norm_var": 0.011481730143229167, "learning_rate": 0.0001, "loss": 4.7421, "loss/crossentropy": 2.059127449989319, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25721821188926697, "step": 3062 }, { "epoch": 0.06128, "grad_norm": 2.6875, "grad_norm_var": 0.0154205322265625, "learning_rate": 0.0001, "loss": 5.1392, "loss/crossentropy": 2.3587669134140015, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.30162671208381653, "step": 3064 }, { "epoch": 0.06132, "grad_norm": 2.59375, "grad_norm_var": 0.018050130208333334, "learning_rate": 0.0001, "loss": 5.2748, "loss/crossentropy": 2.239704966545105, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2713697552680969, "step": 3066 }, { "epoch": 0.06136, "grad_norm": 2.796875, "grad_norm_var": 0.02076416015625, "learning_rate": 0.0001, "loss": 5.0534, "loss/crossentropy": 2.230944514274597, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.27418144047260284, "step": 3068 }, { "epoch": 0.0614, "grad_norm": 2.515625, "grad_norm_var": 0.020563761393229168, "learning_rate": 0.0001, "loss": 4.9145, "loss/crossentropy": 2.196586310863495, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24946682900190353, "step": 3070 }, { "epoch": 0.06144, "grad_norm": 2.65625, "grad_norm_var": 0.019873046875, "learning_rate": 0.0001, "loss": 5.058, "loss/crossentropy": 2.115275800228119, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2668849602341652, "step": 3072 }, { "epoch": 0.06148, "grad_norm": 2.765625, "grad_norm_var": 0.014286295572916666, "learning_rate": 0.0001, "loss": 5.1311, "loss/crossentropy": 2.0227994322776794, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2587117701768875, "step": 3074 }, { "epoch": 0.06152, "grad_norm": 2.421875, "grad_norm_var": 0.016022745768229166, "learning_rate": 0.0001, "loss": 4.711, "loss/crossentropy": 2.231368660926819, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2621329501271248, "step": 3076 }, { "epoch": 0.06156, "grad_norm": 2.421875, "grad_norm_var": 0.0190093994140625, "learning_rate": 0.0001, "loss": 4.6707, "loss/crossentropy": 2.469847083091736, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2758704572916031, "step": 3078 }, { "epoch": 0.0616, "grad_norm": 2.5, "grad_norm_var": 0.01666259765625, "learning_rate": 0.0001, "loss": 4.5753, "loss/crossentropy": 2.257850766181946, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.28196004033088684, "step": 3080 }, { "epoch": 0.06164, "grad_norm": 2.46875, "grad_norm_var": 0.014969889322916667, "learning_rate": 0.0001, "loss": 4.8783, "loss/crossentropy": 2.52754008769989, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.301498681306839, "step": 3082 }, { "epoch": 0.06168, "grad_norm": 2.484375, "grad_norm_var": 0.012279256184895834, "learning_rate": 0.0001, "loss": 4.6595, "loss/crossentropy": 2.120129644870758, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.30073782801628113, "step": 3084 }, { "epoch": 0.06172, "grad_norm": 2.546875, "grad_norm_var": 0.0231353759765625, "learning_rate": 0.0001, "loss": 5.017, "loss/crossentropy": 2.1483632922172546, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.29400819540023804, "step": 3086 }, { "epoch": 0.06176, "grad_norm": 2.59375, "grad_norm_var": 0.022297159830729166, "learning_rate": 0.0001, "loss": 5.2447, "loss/crossentropy": 2.5588048696517944, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3099561035633087, "step": 3088 }, { "epoch": 0.0618, "grad_norm": 2.359375, "grad_norm_var": 0.019852701822916666, "learning_rate": 0.0001, "loss": 4.5507, "loss/crossentropy": 2.338167190551758, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2638262137770653, "step": 3090 }, { "epoch": 0.06184, "grad_norm": 2.5, "grad_norm_var": 0.019603474934895834, "learning_rate": 0.0001, "loss": 5.0471, "loss/crossentropy": 2.61361825466156, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.29420773684978485, "step": 3092 }, { "epoch": 0.06188, "grad_norm": 2.5625, "grad_norm_var": 0.019108072916666666, "learning_rate": 0.0001, "loss": 4.6931, "loss/crossentropy": 2.1078842878341675, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.27283959090709686, "step": 3094 }, { "epoch": 0.06192, "grad_norm": 2.296875, "grad_norm_var": 0.023958333333333335, "learning_rate": 0.0001, "loss": 4.463, "loss/crossentropy": 2.1501123905181885, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3052368611097336, "step": 3096 }, { "epoch": 0.06196, "grad_norm": 2.671875, "grad_norm_var": 0.026656087239583334, "learning_rate": 0.0001, "loss": 5.0482, "loss/crossentropy": 2.3622714281082153, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2851791977882385, "step": 3098 }, { "epoch": 0.062, "grad_norm": 2.671875, "grad_norm_var": 0.05203450520833333, "learning_rate": 0.0001, "loss": 4.3506, "loss/crossentropy": 1.9757406115531921, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24451570957899094, "step": 3100 }, { "epoch": 0.06204, "grad_norm": 2.859375, "grad_norm_var": 0.07011311848958333, "learning_rate": 0.0001, "loss": 5.0409, "loss/crossentropy": 2.1915838718414307, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2749434858560562, "step": 3102 }, { "epoch": 0.06208, "grad_norm": 2.78125, "grad_norm_var": 0.07214253743489583, "learning_rate": 0.0001, "loss": 4.8408, "loss/crossentropy": 2.130649447441101, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2590179592370987, "step": 3104 }, { "epoch": 0.06212, "grad_norm": 2.59375, "grad_norm_var": 0.06317952473958334, "learning_rate": 0.0001, "loss": 4.9083, "loss/crossentropy": 2.4478825330734253, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.30854369699954987, "step": 3106 }, { "epoch": 0.06216, "grad_norm": 2.578125, "grad_norm_var": 0.06524149576822917, "learning_rate": 0.0001, "loss": 4.8249, "loss/crossentropy": 1.7108858227729797, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22502756118774414, "step": 3108 }, { "epoch": 0.0622, "grad_norm": 2.46875, "grad_norm_var": 0.06788736979166667, "learning_rate": 0.0001, "loss": 4.5355, "loss/crossentropy": 1.79804128408432, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24767793715000153, "step": 3110 }, { "epoch": 0.06224, "grad_norm": 3.28125, "grad_norm_var": 0.07480061848958333, "learning_rate": 0.0001, "loss": 4.8779, "loss/crossentropy": 1.899846076965332, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23476070165634155, "step": 3112 }, { "epoch": 0.06228, "grad_norm": 2.453125, "grad_norm_var": 0.0790679931640625, "learning_rate": 0.0001, "loss": 4.7506, "loss/crossentropy": 1.8458155393600464, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24417708814144135, "step": 3114 }, { "epoch": 0.06232, "grad_norm": 2.5, "grad_norm_var": 0.06979166666666667, "learning_rate": 0.0001, "loss": 4.814, "loss/crossentropy": 1.947302520275116, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2566901594400406, "step": 3116 }, { "epoch": 0.06236, "grad_norm": 2.546875, "grad_norm_var": 0.046923828125, "learning_rate": 0.0001, "loss": 4.8424, "loss/crossentropy": 2.039161205291748, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26955537497997284, "step": 3118 }, { "epoch": 0.0624, "grad_norm": 2.65625, "grad_norm_var": 0.04488016764322917, "learning_rate": 0.0001, "loss": 5.1656, "loss/crossentropy": 2.1528661251068115, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.291052982211113, "step": 3120 }, { "epoch": 0.06244, "grad_norm": 2.640625, "grad_norm_var": 0.04644775390625, "learning_rate": 0.0001, "loss": 4.7941, "loss/crossentropy": 2.4113996028900146, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2962254136800766, "step": 3122 }, { "epoch": 0.06248, "grad_norm": 2.65625, "grad_norm_var": 0.044722493489583334, "learning_rate": 0.0001, "loss": 4.8463, "loss/crossentropy": 2.0718825459480286, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24676042795181274, "step": 3124 }, { "epoch": 0.06252, "grad_norm": 2.546875, "grad_norm_var": 0.04442952473958333, "learning_rate": 0.0001, "loss": 4.9723, "loss/crossentropy": 2.433539032936096, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.263284370303154, "step": 3126 }, { "epoch": 0.06256, "grad_norm": 2.421875, "grad_norm_var": 0.0087799072265625, "learning_rate": 0.0001, "loss": 4.7688, "loss/crossentropy": 2.047150671482086, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2655043303966522, "step": 3128 }, { "epoch": 0.0626, "grad_norm": 2.546875, "grad_norm_var": 0.008219401041666666, "learning_rate": 0.0001, "loss": 4.9257, "loss/crossentropy": 1.8897674679756165, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2314881831407547, "step": 3130 }, { "epoch": 0.06264, "grad_norm": 2.390625, "grad_norm_var": 0.00797119140625, "learning_rate": 0.0001, "loss": 4.8053, "loss/crossentropy": 2.1436809301376343, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26931750774383545, "step": 3132 }, { "epoch": 0.06268, "grad_norm": 2.421875, "grad_norm_var": 0.0081695556640625, "learning_rate": 0.0001, "loss": 4.67, "loss/crossentropy": 1.9773973226547241, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26880529522895813, "step": 3134 }, { "epoch": 0.06272, "grad_norm": 2.984375, "grad_norm_var": 0.021903483072916667, "learning_rate": 0.0001, "loss": 5.1236, "loss/crossentropy": 2.208973228931427, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.29979653656482697, "step": 3136 }, { "epoch": 0.06276, "grad_norm": 2.8125, "grad_norm_var": 0.026676432291666666, "learning_rate": 0.0001, "loss": 5.0619, "loss/crossentropy": 2.64120090007782, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.29249200224876404, "step": 3138 }, { "epoch": 0.0628, "grad_norm": 2.703125, "grad_norm_var": 0.0306549072265625, "learning_rate": 0.0001, "loss": 5.0152, "loss/crossentropy": 2.293095588684082, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2689897269010544, "step": 3140 }, { "epoch": 0.06284, "grad_norm": 2.5625, "grad_norm_var": 0.028758748372395834, "learning_rate": 0.0001, "loss": 4.71, "loss/crossentropy": 2.455227494239807, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3202812373638153, "step": 3142 }, { "epoch": 0.06288, "grad_norm": 2.578125, "grad_norm_var": 0.0253814697265625, "learning_rate": 0.0001, "loss": 4.9622, "loss/crossentropy": 1.9667487740516663, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2702512592077255, "step": 3144 }, { "epoch": 0.06292, "grad_norm": 2.578125, "grad_norm_var": 0.024637858072916668, "learning_rate": 0.0001, "loss": 4.7756, "loss/crossentropy": 1.8181490898132324, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2310035452246666, "step": 3146 }, { "epoch": 0.06296, "grad_norm": 2.640625, "grad_norm_var": 0.0373687744140625, "learning_rate": 0.0001, "loss": 4.9556, "loss/crossentropy": 1.9985284805297852, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2944463640451431, "step": 3148 }, { "epoch": 0.063, "grad_norm": 2.28125, "grad_norm_var": 0.04142964680989583, "learning_rate": 0.0001, "loss": 5.0316, "loss/crossentropy": 2.3850419521331787, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.29605095088481903, "step": 3150 }, { "epoch": 0.06304, "grad_norm": 4.125, "grad_norm_var": 0.17021077473958332, "learning_rate": 0.0001, "loss": 5.2062, "loss/crossentropy": 2.3815460205078125, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.30247962474823, "step": 3152 }, { "epoch": 0.06308, "grad_norm": 3.421875, "grad_norm_var": 0.19840087890625, "learning_rate": 0.0001, "loss": 4.5851, "loss/crossentropy": 1.995104193687439, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2506560683250427, "step": 3154 }, { "epoch": 0.06312, "grad_norm": 3.328125, "grad_norm_var": 0.2305084228515625, "learning_rate": 0.0001, "loss": 4.5885, "loss/crossentropy": 2.2037696838378906, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2716705799102783, "step": 3156 }, { "epoch": 0.06316, "grad_norm": 2.84375, "grad_norm_var": 0.23944905598958333, "learning_rate": 0.0001, "loss": 4.7155, "loss/crossentropy": 2.067265272140503, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25495442003011703, "step": 3158 }, { "epoch": 0.0632, "grad_norm": 2.78125, "grad_norm_var": 0.23763020833333334, "learning_rate": 0.0001, "loss": 5.1219, "loss/crossentropy": 2.227355480194092, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.27113111317157745, "step": 3160 }, { "epoch": 0.06324, "grad_norm": 2.546875, "grad_norm_var": 0.23921610514322916, "learning_rate": 0.0001, "loss": 4.6777, "loss/crossentropy": 1.9077441096305847, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2630976662039757, "step": 3162 }, { "epoch": 0.06328, "grad_norm": 2.59375, "grad_norm_var": 0.23515523274739583, "learning_rate": 0.0001, "loss": 5.0084, "loss/crossentropy": 2.1737552881240845, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.28365984559059143, "step": 3164 }, { "epoch": 0.06332, "grad_norm": 2.5625, "grad_norm_var": 0.22542317708333334, "learning_rate": 0.0001, "loss": 4.9822, "loss/crossentropy": 1.9357402920722961, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.27245980501174927, "step": 3166 }, { "epoch": 0.06336, "grad_norm": 2.390625, "grad_norm_var": 0.0995269775390625, "learning_rate": 0.0001, "loss": 4.7681, "loss/crossentropy": 1.8484191298484802, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2933400571346283, "step": 3168 }, { "epoch": 0.0634, "grad_norm": 2.609375, "grad_norm_var": 0.057738240559895834, "learning_rate": 0.0001, "loss": 5.0719, "loss/crossentropy": 2.3085306882858276, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2926720678806305, "step": 3170 }, { "epoch": 0.06344, "grad_norm": 2.78125, "grad_norm_var": 0.040380859375, "learning_rate": 0.0001, "loss": 5.1464, "loss/crossentropy": 2.080895185470581, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2638559564948082, "step": 3172 }, { "epoch": 0.06348, "grad_norm": 2.53125, "grad_norm_var": 0.03287353515625, "learning_rate": 0.0001, "loss": 4.6857, "loss/crossentropy": 2.035506248474121, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2692716419696808, "step": 3174 }, { "epoch": 0.06352, "grad_norm": 2.859375, "grad_norm_var": 0.035252888997395836, "learning_rate": 0.0001, "loss": 4.9479, "loss/crossentropy": 2.1712740659713745, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26579485833644867, "step": 3176 }, { "epoch": 0.06356, "grad_norm": 2.96875, "grad_norm_var": 0.04252827962239583, "learning_rate": 0.0001, "loss": 5.1654, "loss/crossentropy": 2.2256147861480713, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.27605514228343964, "step": 3178 }, { "epoch": 0.0636, "grad_norm": 2.359375, "grad_norm_var": 0.046507771809895834, "learning_rate": 0.0001, "loss": 4.6489, "loss/crossentropy": 2.0917118191719055, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2560323476791382, "step": 3180 }, { "epoch": 0.06364, "grad_norm": 2.671875, "grad_norm_var": 0.045491536458333336, "learning_rate": 0.0001, "loss": 4.9833, "loss/crossentropy": 2.3109938502311707, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.27040477097034454, "step": 3182 }, { "epoch": 0.06368, "grad_norm": 2.65625, "grad_norm_var": 0.03984273274739583, "learning_rate": 0.0001, "loss": 5.1441, "loss/crossentropy": 2.3505672812461853, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.29180124402046204, "step": 3184 }, { "epoch": 0.06372, "grad_norm": 2.578125, "grad_norm_var": 0.04014383951822917, "learning_rate": 0.0001, "loss": 5.0224, "loss/crossentropy": 2.2471169233322144, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2676118314266205, "step": 3186 }, { "epoch": 0.06376, "grad_norm": 2.46875, "grad_norm_var": 0.0264312744140625, "learning_rate": 0.0001, "loss": 4.8247, "loss/crossentropy": 2.3312125205993652, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.29243919253349304, "step": 3188 }, { "epoch": 0.0638, "grad_norm": 2.4375, "grad_norm_var": 0.02681884765625, "learning_rate": 0.0001, "loss": 4.9374, "loss/crossentropy": 2.1709930896759033, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2786664366722107, "step": 3190 }, { "epoch": 0.06384, "grad_norm": 2.609375, "grad_norm_var": 0.02476806640625, "learning_rate": 0.0001, "loss": 4.7391, "loss/crossentropy": 1.8477665185928345, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26132869720458984, "step": 3192 }, { "epoch": 0.06388, "grad_norm": 2.6875, "grad_norm_var": 0.01724853515625, "learning_rate": 0.0001, "loss": 4.8167, "loss/crossentropy": 2.2102121114730835, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26027651876211166, "step": 3194 }, { "epoch": 0.06392, "grad_norm": 2.640625, "grad_norm_var": 0.0168365478515625, "learning_rate": 0.0001, "loss": 4.6381, "loss/crossentropy": 2.0011088252067566, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25629256665706635, "step": 3196 }, { "epoch": 0.06396, "grad_norm": 2.625, "grad_norm_var": 0.01685791015625, "learning_rate": 0.0001, "loss": 4.9215, "loss/crossentropy": 2.319412350654602, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.28315384685993195, "step": 3198 }, { "epoch": 0.064, "grad_norm": 2.328125, "grad_norm_var": 0.0145172119140625, "learning_rate": 0.0001, "loss": 4.7936, "loss/crossentropy": 2.192026138305664, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.264492504298687, "step": 3200 }, { "epoch": 0.06404, "grad_norm": 2.78125, "grad_norm_var": 0.017024739583333334, "learning_rate": 0.0001, "loss": 4.8178, "loss/crossentropy": 2.1745734214782715, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.29369065165519714, "step": 3202 }, { "epoch": 0.06408, "grad_norm": 3.109375, "grad_norm_var": 0.042313639322916666, "learning_rate": 0.0001, "loss": 5.577, "loss/crossentropy": 2.5039013624191284, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2835303544998169, "step": 3204 }, { "epoch": 0.06412, "grad_norm": 2.53125, "grad_norm_var": 0.04075419108072917, "learning_rate": 0.0001, "loss": 5.0308, "loss/crossentropy": 2.4255706071853638, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3014884740114212, "step": 3206 }, { "epoch": 0.06416, "grad_norm": 2.5625, "grad_norm_var": 0.03873291015625, "learning_rate": 0.0001, "loss": 5.002, "loss/crossentropy": 2.266150116920471, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2698349952697754, "step": 3208 }, { "epoch": 0.0642, "grad_norm": 2.4375, "grad_norm_var": 0.03805338541666667, "learning_rate": 0.0001, "loss": 5.0173, "loss/crossentropy": 2.2042946815490723, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.28830648958683014, "step": 3210 }, { "epoch": 0.06424, "grad_norm": 2.5625, "grad_norm_var": 0.03578999837239583, "learning_rate": 0.0001, "loss": 5.0292, "loss/crossentropy": 2.4034690856933594, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2834039032459259, "step": 3212 }, { "epoch": 0.06428, "grad_norm": 2.4375, "grad_norm_var": 0.0444000244140625, "learning_rate": 0.0001, "loss": 4.6655, "loss/crossentropy": 2.1347755193710327, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26405099034309387, "step": 3214 }, { "epoch": 0.06432, "grad_norm": 3.34375, "grad_norm_var": 0.07415262858072917, "learning_rate": 0.0001, "loss": 5.0862, "loss/crossentropy": 2.018262207508087, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.27144598215818405, "step": 3216 }, { "epoch": 0.06436, "grad_norm": 2.46875, "grad_norm_var": 0.076953125, "learning_rate": 0.0001, "loss": 4.3969, "loss/crossentropy": 2.0254003405570984, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24976836144924164, "step": 3218 }, { "epoch": 0.0644, "grad_norm": 2.78125, "grad_norm_var": 0.06026102701822917, "learning_rate": 0.0001, "loss": 4.6779, "loss/crossentropy": 2.1952659487724304, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2786559462547302, "step": 3220 }, { "epoch": 0.06444, "grad_norm": 2.609375, "grad_norm_var": 0.06024983723958333, "learning_rate": 0.0001, "loss": 4.8828, "loss/crossentropy": 2.1383036375045776, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2570757120847702, "step": 3222 }, { "epoch": 0.06448, "grad_norm": 2.65625, "grad_norm_var": 0.05915425618489583, "learning_rate": 0.0001, "loss": 4.8248, "loss/crossentropy": 2.267812967300415, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2774328589439392, "step": 3224 }, { "epoch": 0.06452, "grad_norm": 2.515625, "grad_norm_var": 0.0587066650390625, "learning_rate": 0.0001, "loss": 4.9389, "loss/crossentropy": 1.906205415725708, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2528962790966034, "step": 3226 }, { "epoch": 0.06456, "grad_norm": 2.453125, "grad_norm_var": 0.05895182291666667, "learning_rate": 0.0001, "loss": 4.9823, "loss/crossentropy": 2.1628336906433105, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.27087944746017456, "step": 3228 }, { "epoch": 0.0646, "grad_norm": 2.390625, "grad_norm_var": 0.05807291666666667, "learning_rate": 0.0001, "loss": 4.5893, "loss/crossentropy": 1.8845162391662598, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2241554707288742, "step": 3230 }, { "epoch": 0.06464, "grad_norm": 2.390625, "grad_norm_var": 0.019237263997395834, "learning_rate": 0.0001, "loss": 4.9143, "loss/crossentropy": 2.4157146215438843, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2770439088344574, "step": 3232 }, { "epoch": 0.06468, "grad_norm": 2.515625, "grad_norm_var": 0.021825154622395832, "learning_rate": 0.0001, "loss": 4.832, "loss/crossentropy": 1.9979816675186157, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23696578294038773, "step": 3234 }, { "epoch": 0.06472, "grad_norm": 2.484375, "grad_norm_var": 0.016364542643229167, "learning_rate": 0.0001, "loss": 5.1123, "loss/crossentropy": 2.1790190935134888, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.252600260078907, "step": 3236 }, { "epoch": 0.06476, "grad_norm": 2.4375, "grad_norm_var": 0.016624959309895833, "learning_rate": 0.0001, "loss": 4.5943, "loss/crossentropy": 2.1612111926078796, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.27929094433784485, "step": 3238 }, { "epoch": 0.0648, "grad_norm": 2.546875, "grad_norm_var": 0.08124898274739584, "learning_rate": 0.0001, "loss": 4.8293, "loss/crossentropy": 2.261025071144104, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3034510314464569, "step": 3240 }, { "epoch": 0.06484, "grad_norm": 2.46875, "grad_norm_var": 0.081689453125, "learning_rate": 0.0001, "loss": 4.8243, "loss/crossentropy": 2.02247554063797, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2655429244041443, "step": 3242 }, { "epoch": 0.06488, "grad_norm": 2.390625, "grad_norm_var": 0.08289286295572916, "learning_rate": 0.0001, "loss": 4.9428, "loss/crossentropy": 2.495269775390625, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.32837189733982086, "step": 3244 }, { "epoch": 0.06492, "grad_norm": 2.53125, "grad_norm_var": 0.07550455729166666, "learning_rate": 0.0001, "loss": 4.9928, "loss/crossentropy": 2.365166425704956, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.27000221610069275, "step": 3246 }, { "epoch": 0.06496, "grad_norm": 2.4375, "grad_norm_var": 0.0746246337890625, "learning_rate": 0.0001, "loss": 4.837, "loss/crossentropy": 1.8728906512260437, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2711311876773834, "step": 3248 }, { "epoch": 0.065, "grad_norm": 2.375, "grad_norm_var": 0.07427978515625, "learning_rate": 0.0001, "loss": 4.7916, "loss/crossentropy": 1.916531264781952, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23036544770002365, "step": 3250 }, { "epoch": 0.06504, "grad_norm": 2.421875, "grad_norm_var": 0.07463277180989583, "learning_rate": 0.0001, "loss": 4.9399, "loss/crossentropy": 1.996503233909607, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24317056685686111, "step": 3252 }, { "epoch": 0.06508, "grad_norm": 2.5625, "grad_norm_var": 0.07366536458333334, "learning_rate": 0.0001, "loss": 4.8746, "loss/crossentropy": 2.101921260356903, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2775810658931732, "step": 3254 }, { "epoch": 0.06512, "grad_norm": 2.40625, "grad_norm_var": 0.005125935872395833, "learning_rate": 0.0001, "loss": 4.7526, "loss/crossentropy": 1.9286046028137207, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23964455723762512, "step": 3256 }, { "epoch": 0.06516, "grad_norm": 2.40625, "grad_norm_var": 0.005464680989583333, "learning_rate": 0.0001, "loss": 4.8216, "loss/crossentropy": 2.224915862083435, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.27516382932662964, "step": 3258 }, { "epoch": 0.0652, "grad_norm": 2.34375, "grad_norm_var": 0.0133697509765625, "learning_rate": 0.0001, "loss": 4.6553, "loss/crossentropy": 2.259337306022644, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.27185751497745514, "step": 3260 }, { "epoch": 0.06524, "grad_norm": 2.484375, "grad_norm_var": 0.013117472330729166, "learning_rate": 0.0001, "loss": 4.7095, "loss/crossentropy": 2.1081044673919678, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23740330338478088, "step": 3262 }, { "epoch": 0.06528, "grad_norm": 2.546875, "grad_norm_var": 0.0131744384765625, "learning_rate": 0.0001, "loss": 4.6375, "loss/crossentropy": 1.9032058119773865, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2508246824145317, "step": 3264 }, { "epoch": 0.06532, "grad_norm": 2.671875, "grad_norm_var": 0.013353474934895833, "learning_rate": 0.0001, "loss": 4.7165, "loss/crossentropy": 2.0773792266845703, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2797544598579407, "step": 3266 }, { "epoch": 0.06536, "grad_norm": 2.40625, "grad_norm_var": 0.013509114583333334, "learning_rate": 0.0001, "loss": 4.9136, "loss/crossentropy": 2.1591526865959167, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2540442571043968, "step": 3268 }, { "epoch": 0.0654, "grad_norm": 2.546875, "grad_norm_var": 0.014351399739583333, "learning_rate": 0.0001, "loss": 4.6248, "loss/crossentropy": 1.7735809683799744, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24107928574085236, "step": 3270 }, { "epoch": 0.06544, "grad_norm": 2.5, "grad_norm_var": 0.014647420247395833, "learning_rate": 0.0001, "loss": 4.7568, "loss/crossentropy": 2.069553792476654, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26356737315654755, "step": 3272 }, { "epoch": 0.06548, "grad_norm": 2.453125, "grad_norm_var": 0.014623006184895834, "learning_rate": 0.0001, "loss": 4.7274, "loss/crossentropy": 1.9688642024993896, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.29453104734420776, "step": 3274 }, { "epoch": 0.06552, "grad_norm": 2.296875, "grad_norm_var": 0.012043253580729166, "learning_rate": 0.0001, "loss": 4.4264, "loss/crossentropy": 1.785252034664154, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23213861882686615, "step": 3276 }, { "epoch": 0.06556, "grad_norm": 2.65625, "grad_norm_var": 0.013255818684895834, "learning_rate": 0.0001, "loss": 4.779, "loss/crossentropy": 2.155774712562561, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25758183747529984, "step": 3278 }, { "epoch": 0.0656, "grad_norm": 2.5625, "grad_norm_var": 0.016161092122395835, "learning_rate": 0.0001, "loss": 5.0453, "loss/crossentropy": 2.0403348803520203, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25007252395153046, "step": 3280 }, { "epoch": 0.06564, "grad_norm": 2.4375, "grad_norm_var": 0.015208943684895834, "learning_rate": 0.0001, "loss": 4.7161, "loss/crossentropy": 1.9963608384132385, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2608266994357109, "step": 3282 }, { "epoch": 0.06568, "grad_norm": 2.75, "grad_norm_var": 0.019364420572916666, "learning_rate": 0.0001, "loss": 4.9275, "loss/crossentropy": 2.2021098732948303, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.28361018002033234, "step": 3284 }, { "epoch": 0.06572, "grad_norm": 2.59375, "grad_norm_var": 0.01812744140625, "learning_rate": 0.0001, "loss": 4.6533, "loss/crossentropy": 2.0363903641700745, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2641526162624359, "step": 3286 }, { "epoch": 0.06576, "grad_norm": 2.671875, "grad_norm_var": 0.020466105143229166, "learning_rate": 0.0001, "loss": 4.6884, "loss/crossentropy": 2.1052531003952026, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24344927817583084, "step": 3288 }, { "epoch": 0.0658, "grad_norm": 2.578125, "grad_norm_var": 0.02027587890625, "learning_rate": 0.0001, "loss": 4.9246, "loss/crossentropy": 2.0664029717445374, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25679293274879456, "step": 3290 }, { "epoch": 0.06584, "grad_norm": 2.625, "grad_norm_var": 0.013678995768229167, "learning_rate": 0.0001, "loss": 4.7802, "loss/crossentropy": 2.168351709842682, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2759328931570053, "step": 3292 }, { "epoch": 0.06588, "grad_norm": 2.515625, "grad_norm_var": 0.013703409830729167, "learning_rate": 0.0001, "loss": 4.8481, "loss/crossentropy": 1.9305949211120605, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26165173947811127, "step": 3294 }, { "epoch": 0.06592, "grad_norm": 2.4375, "grad_norm_var": 0.017476399739583332, "learning_rate": 0.0001, "loss": 4.6942, "loss/crossentropy": 1.9257155060768127, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24499841034412384, "step": 3296 }, { "epoch": 0.06596, "grad_norm": 2.484375, "grad_norm_var": 0.017513020833333334, "learning_rate": 0.0001, "loss": 4.8941, "loss/crossentropy": 2.1406426429748535, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26676414906978607, "step": 3298 }, { "epoch": 0.066, "grad_norm": 2.734375, "grad_norm_var": 0.016422526041666666, "learning_rate": 0.0001, "loss": 4.8293, "loss/crossentropy": 2.099781036376953, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2916935384273529, "step": 3300 }, { "epoch": 0.06604, "grad_norm": 2.875, "grad_norm_var": 0.023094685872395833, "learning_rate": 0.0001, "loss": 4.9504, "loss/crossentropy": 2.1077913641929626, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2667912393808365, "step": 3302 }, { "epoch": 0.06608, "grad_norm": 3.90625, "grad_norm_var": 0.1297271728515625, "learning_rate": 0.0001, "loss": 5.2952, "loss/crossentropy": 2.2583223581314087, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2907385528087616, "step": 3304 }, { "epoch": 0.06612, "grad_norm": 2.71875, "grad_norm_var": 0.13405659993489583, "learning_rate": 0.0001, "loss": 4.5813, "loss/crossentropy": 1.847477912902832, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2272188812494278, "step": 3306 }, { "epoch": 0.06616, "grad_norm": 2.40625, "grad_norm_var": 0.1392486572265625, "learning_rate": 0.0001, "loss": 4.6502, "loss/crossentropy": 1.7610225677490234, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23973071575164795, "step": 3308 }, { "epoch": 0.0662, "grad_norm": 2.328125, "grad_norm_var": 0.1464019775390625, "learning_rate": 0.0001, "loss": 4.7123, "loss/crossentropy": 1.977916419506073, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25456882268190384, "step": 3310 }, { "epoch": 0.06624, "grad_norm": 2.734375, "grad_norm_var": 0.14095052083333334, "learning_rate": 0.0001, "loss": 4.7353, "loss/crossentropy": 2.2196428775787354, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2563377171754837, "step": 3312 }, { "epoch": 0.06628, "grad_norm": 2.53125, "grad_norm_var": 0.14537760416666667, "learning_rate": 0.0001, "loss": 4.6907, "loss/crossentropy": 2.0861470699310303, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.27121224999427795, "step": 3314 }, { "epoch": 0.06632, "grad_norm": 2.53125, "grad_norm_var": 0.14287821451822916, "learning_rate": 0.0001, "loss": 4.6484, "loss/crossentropy": 1.9716283679008484, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.277963787317276, "step": 3316 }, { "epoch": 0.06636, "grad_norm": 2.609375, "grad_norm_var": 0.1375, "learning_rate": 0.0001, "loss": 4.7985, "loss/crossentropy": 2.5604729652404785, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.31031742691993713, "step": 3318 }, { "epoch": 0.0664, "grad_norm": 2.71875, "grad_norm_var": 0.022337849934895834, "learning_rate": 0.0001, "loss": 5.4329, "loss/crossentropy": 2.2672252655029297, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.29225634038448334, "step": 3320 }, { "epoch": 0.06644, "grad_norm": 2.671875, "grad_norm_var": 0.020015462239583334, "learning_rate": 0.0001, "loss": 5.0557, "loss/crossentropy": 2.0992931723594666, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2648598402738571, "step": 3322 }, { "epoch": 0.06648, "grad_norm": 2.65625, "grad_norm_var": 0.014827473958333334, "learning_rate": 0.0001, "loss": 5.0895, "loss/crossentropy": 2.208917260169983, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2998420298099518, "step": 3324 }, { "epoch": 0.06652, "grad_norm": 2.671875, "grad_norm_var": 0.012791951497395834, "learning_rate": 0.0001, "loss": 4.8834, "loss/crossentropy": 2.290796995162964, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2638905793428421, "step": 3326 }, { "epoch": 0.06656, "grad_norm": 2.375, "grad_norm_var": 0.012214152018229167, "learning_rate": 0.0001, "loss": 4.8521, "loss/crossentropy": 2.4156445264816284, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.28743064403533936, "step": 3328 }, { "epoch": 0.0666, "grad_norm": 2.734375, "grad_norm_var": 0.012132771809895833, "learning_rate": 0.0001, "loss": 5.2205, "loss/crossentropy": 2.4604904651641846, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.27585768699645996, "step": 3330 }, { "epoch": 0.06664, "grad_norm": 2.421875, "grad_norm_var": 0.013818359375, "learning_rate": 0.0001, "loss": 4.8296, "loss/crossentropy": 1.8613844513893127, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.27038969844579697, "step": 3332 }, { "epoch": 0.06668, "grad_norm": 2.53125, "grad_norm_var": 0.014046223958333333, "learning_rate": 0.0001, "loss": 4.8777, "loss/crossentropy": 2.232776403427124, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2686954140663147, "step": 3334 }, { "epoch": 0.06672, "grad_norm": 2.5625, "grad_norm_var": 0.0120758056640625, "learning_rate": 0.0001, "loss": 4.9536, "loss/crossentropy": 2.070033550262451, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26176171004772186, "step": 3336 }, { "epoch": 0.06676, "grad_norm": 2.4375, "grad_norm_var": 0.01187744140625, "learning_rate": 0.0001, "loss": 4.8616, "loss/crossentropy": 2.38937509059906, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2539200633764267, "step": 3338 }, { "epoch": 0.0668, "grad_norm": 2.53125, "grad_norm_var": 0.010933430989583333, "learning_rate": 0.0001, "loss": 5.1104, "loss/crossentropy": 2.272845983505249, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2856733053922653, "step": 3340 }, { "epoch": 0.06684, "grad_norm": 2.5625, "grad_norm_var": 0.01181640625, "learning_rate": 0.0001, "loss": 4.7935, "loss/crossentropy": 2.427489161491394, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.30572691559791565, "step": 3342 }, { "epoch": 0.06688, "grad_norm": 2.59375, "grad_norm_var": 0.009407552083333333, "learning_rate": 0.0001, "loss": 4.9801, "loss/crossentropy": 2.4701327085494995, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.28469331562519073, "step": 3344 }, { "epoch": 0.06692, "grad_norm": 2.421875, "grad_norm_var": 0.006917317708333333, "learning_rate": 0.0001, "loss": 4.815, "loss/crossentropy": 2.0733558535575867, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.292859822511673, "step": 3346 }, { "epoch": 0.06696, "grad_norm": 2.640625, "grad_norm_var": 0.0061187744140625, "learning_rate": 0.0001, "loss": 4.8379, "loss/crossentropy": 2.301755905151367, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3174070864915848, "step": 3348 }, { "epoch": 0.067, "grad_norm": 2.234375, "grad_norm_var": 0.012580362955729167, "learning_rate": 0.0001, "loss": 4.3359, "loss/crossentropy": 2.039419114589691, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24543144553899765, "step": 3350 }, { "epoch": 0.06704, "grad_norm": 2.4375, "grad_norm_var": 0.0144683837890625, "learning_rate": 0.0001, "loss": 4.7938, "loss/crossentropy": 1.9247611165046692, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2595005929470062, "step": 3352 }, { "epoch": 0.06708, "grad_norm": 2.34375, "grad_norm_var": 0.01630859375, "learning_rate": 0.0001, "loss": 4.8294, "loss/crossentropy": 2.224974751472473, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26880575716495514, "step": 3354 }, { "epoch": 0.06712, "grad_norm": 2.734375, "grad_norm_var": 0.0195220947265625, "learning_rate": 0.0001, "loss": 4.8922, "loss/crossentropy": 2.2601993083953857, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.279633030295372, "step": 3356 }, { "epoch": 0.06716, "grad_norm": 2.4375, "grad_norm_var": 0.017085774739583334, "learning_rate": 0.0001, "loss": 4.5911, "loss/crossentropy": 1.8156417608261108, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22118539363145828, "step": 3358 }, { "epoch": 0.0672, "grad_norm": 2.734375, "grad_norm_var": 0.020140584309895834, "learning_rate": 0.0001, "loss": 5.2032, "loss/crossentropy": 2.207859516143799, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2771998345851898, "step": 3360 }, { "epoch": 0.06724, "grad_norm": 2.453125, "grad_norm_var": 0.02017822265625, "learning_rate": 0.0001, "loss": 4.9352, "loss/crossentropy": 1.9886083602905273, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2545766308903694, "step": 3362 }, { "epoch": 0.06728, "grad_norm": 2.234375, "grad_norm_var": 0.025777180989583332, "learning_rate": 0.0001, "loss": 4.4323, "loss/crossentropy": 1.7046592235565186, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2207612618803978, "step": 3364 }, { "epoch": 0.06732, "grad_norm": 2.453125, "grad_norm_var": 0.021122233072916666, "learning_rate": 0.0001, "loss": 5.0209, "loss/crossentropy": 2.0283663868904114, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2743644416332245, "step": 3366 }, { "epoch": 0.06736, "grad_norm": 2.453125, "grad_norm_var": 0.019498697916666665, "learning_rate": 0.0001, "loss": 4.4788, "loss/crossentropy": 1.975772500038147, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2505730241537094, "step": 3368 }, { "epoch": 0.0674, "grad_norm": 2.5, "grad_norm_var": 0.019950358072916667, "learning_rate": 0.0001, "loss": 4.6863, "loss/crossentropy": 1.9021872282028198, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25430944561958313, "step": 3370 }, { "epoch": 0.06744, "grad_norm": 2.671875, "grad_norm_var": 0.0182037353515625, "learning_rate": 0.0001, "loss": 5.1859, "loss/crossentropy": 2.3888463973999023, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.27896443009376526, "step": 3372 }, { "epoch": 0.06748, "grad_norm": 2.71875, "grad_norm_var": 0.02086181640625, "learning_rate": 0.0001, "loss": 4.6041, "loss/crossentropy": 1.7844690680503845, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2536455988883972, "step": 3374 }, { "epoch": 0.06752, "grad_norm": 2.703125, "grad_norm_var": 0.021442667643229166, "learning_rate": 0.0001, "loss": 4.4538, "loss/crossentropy": 1.919598639011383, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26129309833049774, "step": 3376 }, { "epoch": 0.06756, "grad_norm": 2.671875, "grad_norm_var": 0.022835286458333333, "learning_rate": 0.0001, "loss": 5.0568, "loss/crossentropy": 2.2292110919952393, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25612247735261917, "step": 3378 }, { "epoch": 0.0676, "grad_norm": 3.4375, "grad_norm_var": 0.0779296875, "learning_rate": 0.0001, "loss": 4.8674, "loss/crossentropy": 2.2235841751098633, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2607487738132477, "step": 3380 }, { "epoch": 0.06764, "grad_norm": 2.34375, "grad_norm_var": 0.08089192708333333, "learning_rate": 0.0001, "loss": 4.5872, "loss/crossentropy": 2.1375235319137573, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2344205006957054, "step": 3382 }, { "epoch": 0.06768, "grad_norm": 2.28125, "grad_norm_var": 0.08501688639322917, "learning_rate": 0.0001, "loss": 4.6076, "loss/crossentropy": 2.020237445831299, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2550392523407936, "step": 3384 }, { "epoch": 0.06772, "grad_norm": 2.734375, "grad_norm_var": 0.07911783854166667, "learning_rate": 0.0001, "loss": 4.6939, "loss/crossentropy": 2.138959765434265, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2742607295513153, "step": 3386 }, { "epoch": 0.06776, "grad_norm": 2.578125, "grad_norm_var": 0.07864481608072917, "learning_rate": 0.0001, "loss": 4.9065, "loss/crossentropy": 2.521559953689575, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2892928719520569, "step": 3388 }, { "epoch": 0.0678, "grad_norm": 2.375, "grad_norm_var": 0.07815348307291667, "learning_rate": 0.0001, "loss": 5.049, "loss/crossentropy": 2.169550120830536, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.27750439941883087, "step": 3390 }, { "epoch": 0.06784, "grad_norm": 2.6875, "grad_norm_var": 0.07464090983072917, "learning_rate": 0.0001, "loss": 4.7401, "loss/crossentropy": 1.975584864616394, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2557516545057297, "step": 3392 }, { "epoch": 0.06788, "grad_norm": 2.6875, "grad_norm_var": 0.07617899576822916, "learning_rate": 0.0001, "loss": 4.9207, "loss/crossentropy": 2.5837322473526, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3444886952638626, "step": 3394 }, { "epoch": 0.06792, "grad_norm": 2.546875, "grad_norm_var": 0.022294108072916666, "learning_rate": 0.0001, "loss": 4.743, "loss/crossentropy": 1.963110864162445, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24120041728019714, "step": 3396 }, { "epoch": 0.06796, "grad_norm": 2.65625, "grad_norm_var": 0.0211090087890625, "learning_rate": 0.0001, "loss": 4.5759, "loss/crossentropy": 2.381603956222534, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26319295167922974, "step": 3398 }, { "epoch": 0.068, "grad_norm": 2.25, "grad_norm_var": 0.023078409830729167, "learning_rate": 0.0001, "loss": 4.5642, "loss/crossentropy": 2.054026961326599, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24540280550718307, "step": 3400 }, { "epoch": 0.06804, "grad_norm": 2.46875, "grad_norm_var": 0.0187652587890625, "learning_rate": 0.0001, "loss": 4.3661, "loss/crossentropy": 2.047453820705414, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2524164840579033, "step": 3402 }, { "epoch": 0.06808, "grad_norm": 2.390625, "grad_norm_var": 0.018317667643229167, "learning_rate": 0.0001, "loss": 4.7839, "loss/crossentropy": 1.8616933226585388, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23905867338180542, "step": 3404 }, { "epoch": 0.06812, "grad_norm": 2.4375, "grad_norm_var": 0.016422526041666666, "learning_rate": 0.0001, "loss": 4.6932, "loss/crossentropy": 2.4304568767547607, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2802550047636032, "step": 3406 }, { "epoch": 0.06816, "grad_norm": 2.5, "grad_norm_var": 0.01318359375, "learning_rate": 0.0001, "loss": 4.7005, "loss/crossentropy": 1.8178748488426208, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24266308546066284, "step": 3408 }, { "epoch": 0.0682, "grad_norm": 2.625, "grad_norm_var": 0.0116363525390625, "learning_rate": 0.0001, "loss": 5.0152, "loss/crossentropy": 2.025859773159027, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.282450333237648, "step": 3410 }, { "epoch": 0.06824, "grad_norm": 2.53125, "grad_norm_var": 0.010383097330729167, "learning_rate": 0.0001, "loss": 4.737, "loss/crossentropy": 2.032994568347931, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26047058403491974, "step": 3412 }, { "epoch": 0.06828, "grad_norm": 2.46875, "grad_norm_var": 0.007840983072916667, "learning_rate": 0.0001, "loss": 4.6428, "loss/crossentropy": 2.2468607425689697, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2814445495605469, "step": 3414 }, { "epoch": 0.06832, "grad_norm": 2.75, "grad_norm_var": 0.03629150390625, "learning_rate": 0.0001, "loss": 4.6854, "loss/crossentropy": 2.2534161806106567, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3231179416179657, "step": 3416 }, { "epoch": 0.06836, "grad_norm": 2.65625, "grad_norm_var": 0.03443603515625, "learning_rate": 0.0001, "loss": 4.9599, "loss/crossentropy": 2.1678181886672974, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2840191572904587, "step": 3418 }, { "epoch": 0.0684, "grad_norm": 2.328125, "grad_norm_var": 0.03453776041666667, "learning_rate": 0.0001, "loss": 4.8667, "loss/crossentropy": 2.053459882736206, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.271483838558197, "step": 3420 }, { "epoch": 0.06844, "grad_norm": 2.53125, "grad_norm_var": 0.03243815104166667, "learning_rate": 0.0001, "loss": 4.714, "loss/crossentropy": 2.2278919219970703, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2781473994255066, "step": 3422 }, { "epoch": 0.06848, "grad_norm": 2.296875, "grad_norm_var": 0.0384674072265625, "learning_rate": 0.0001, "loss": 4.5262, "loss/crossentropy": 1.9582479000091553, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24171434342861176, "step": 3424 }, { "epoch": 0.06852, "grad_norm": 2.359375, "grad_norm_var": 0.04121805826822917, "learning_rate": 0.0001, "loss": 4.4734, "loss/crossentropy": 1.808964192867279, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22458729147911072, "step": 3426 }, { "epoch": 0.06856, "grad_norm": 2.40625, "grad_norm_var": 0.04296468098958333, "learning_rate": 0.0001, "loss": 4.464, "loss/crossentropy": 1.9225260019302368, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2508438527584076, "step": 3428 }, { "epoch": 0.0686, "grad_norm": 2.578125, "grad_norm_var": 0.044465128580729166, "learning_rate": 0.0001, "loss": 4.9647, "loss/crossentropy": 2.2446881532669067, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26919613778591156, "step": 3430 }, { "epoch": 0.06864, "grad_norm": 2.515625, "grad_norm_var": 0.017899576822916666, "learning_rate": 0.0001, "loss": 4.6865, "loss/crossentropy": 2.2047033309936523, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2581355720758438, "step": 3432 }, { "epoch": 0.06868, "grad_norm": 2.46875, "grad_norm_var": 0.015729777018229165, "learning_rate": 0.0001, "loss": 4.7139, "loss/crossentropy": 2.1223543882369995, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25814756751060486, "step": 3434 }, { "epoch": 0.06872, "grad_norm": 2.59375, "grad_norm_var": 0.016813151041666665, "learning_rate": 0.0001, "loss": 4.6771, "loss/crossentropy": 2.1641053557395935, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25222497433423996, "step": 3436 }, { "epoch": 0.06876, "grad_norm": 2.640625, "grad_norm_var": 0.0189605712890625, "learning_rate": 0.0001, "loss": 4.6972, "loss/crossentropy": 1.9569795727729797, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2681007981300354, "step": 3438 }, { "epoch": 0.0688, "grad_norm": 2.453125, "grad_norm_var": 0.013216145833333333, "learning_rate": 0.0001, "loss": 4.8214, "loss/crossentropy": 2.212220251560211, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.28191742300987244, "step": 3440 }, { "epoch": 0.06884, "grad_norm": 2.34375, "grad_norm_var": 0.011546834309895834, "learning_rate": 0.0001, "loss": 4.4162, "loss/crossentropy": 1.9564262628555298, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2560906559228897, "step": 3442 }, { "epoch": 0.06888, "grad_norm": 2.46875, "grad_norm_var": 0.012043253580729166, "learning_rate": 0.0001, "loss": 4.6837, "loss/crossentropy": 1.8553346395492554, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24626825004816055, "step": 3444 }, { "epoch": 0.06892, "grad_norm": 2.3125, "grad_norm_var": 0.012007649739583333, "learning_rate": 0.0001, "loss": 4.7914, "loss/crossentropy": 1.9803723692893982, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2712126225233078, "step": 3446 }, { "epoch": 0.06896, "grad_norm": 2.46875, "grad_norm_var": 0.010423787434895833, "learning_rate": 0.0001, "loss": 4.857, "loss/crossentropy": 1.9914751648902893, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2565220594406128, "step": 3448 }, { "epoch": 0.069, "grad_norm": 2.484375, "grad_norm_var": 0.010758463541666667, "learning_rate": 0.0001, "loss": 4.8013, "loss/crossentropy": 2.2114094495773315, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.28365010023117065, "step": 3450 }, { "epoch": 0.06904, "grad_norm": 2.453125, "grad_norm_var": 0.009577433268229166, "learning_rate": 0.0001, "loss": 5.0322, "loss/crossentropy": 2.4366514682769775, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.30788426101207733, "step": 3452 }, { "epoch": 0.06908, "grad_norm": 2.671875, "grad_norm_var": 0.010595703125, "learning_rate": 0.0001, "loss": 4.7902, "loss/crossentropy": 2.304569959640503, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2835633158683777, "step": 3454 }, { "epoch": 0.06912, "grad_norm": 2.65625, "grad_norm_var": 0.027318318684895832, "learning_rate": 0.0001, "loss": 5.151, "loss/crossentropy": 2.2518080472946167, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.278719499707222, "step": 3456 }, { "epoch": 0.06916, "grad_norm": 2.71875, "grad_norm_var": 0.027904256184895834, "learning_rate": 0.0001, "loss": 5.1688, "loss/crossentropy": 2.333768129348755, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.28694969415664673, "step": 3458 }, { "epoch": 0.0692, "grad_norm": 2.46875, "grad_norm_var": 0.0260650634765625, "learning_rate": 0.0001, "loss": 5.1421, "loss/crossentropy": 2.3534432649612427, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3005864769220352, "step": 3460 }, { "epoch": 0.06924, "grad_norm": 2.4375, "grad_norm_var": 0.022981770833333335, "learning_rate": 0.0001, "loss": 4.9502, "loss/crossentropy": 2.0703017711639404, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25969168543815613, "step": 3462 }, { "epoch": 0.06928, "grad_norm": 2.65625, "grad_norm_var": 0.023502604166666666, "learning_rate": 0.0001, "loss": 5.0258, "loss/crossentropy": 2.167420506477356, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.32322582602500916, "step": 3464 }, { "epoch": 0.06932, "grad_norm": 2.515625, "grad_norm_var": 0.021222941080729165, "learning_rate": 0.0001, "loss": 4.5342, "loss/crossentropy": 2.0845181941986084, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26804475486278534, "step": 3466 }, { "epoch": 0.06936, "grad_norm": 2.4375, "grad_norm_var": 0.021219889322916668, "learning_rate": 0.0001, "loss": 5.0459, "loss/crossentropy": 2.4165114164352417, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2866530567407608, "step": 3468 }, { "epoch": 0.0694, "grad_norm": 2.765625, "grad_norm_var": 0.022359212239583332, "learning_rate": 0.0001, "loss": 5.0938, "loss/crossentropy": 2.4152863025665283, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.30185502767562866, "step": 3470 }, { "epoch": 0.06944, "grad_norm": 2.28125, "grad_norm_var": 0.014924112955729167, "learning_rate": 0.0001, "loss": 4.6362, "loss/crossentropy": 2.1878501176834106, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2736133933067322, "step": 3472 }, { "epoch": 0.06948, "grad_norm": 2.4375, "grad_norm_var": 0.015262858072916666, "learning_rate": 0.0001, "loss": 4.812, "loss/crossentropy": 2.055173695087433, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2609568238258362, "step": 3474 }, { "epoch": 0.06952, "grad_norm": 2.453125, "grad_norm_var": 0.015550740559895833, "learning_rate": 0.0001, "loss": 4.8201, "loss/crossentropy": 2.050000250339508, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2713121324777603, "step": 3476 }, { "epoch": 0.06956, "grad_norm": 2.65625, "grad_norm_var": 0.019169108072916666, "learning_rate": 0.0001, "loss": 4.9916, "loss/crossentropy": 2.227464199066162, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2710970640182495, "step": 3478 }, { "epoch": 0.0696, "grad_norm": 2.46875, "grad_norm_var": 0.016828409830729165, "learning_rate": 0.0001, "loss": 4.7435, "loss/crossentropy": 2.096015691757202, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.29556676745414734, "step": 3480 }, { "epoch": 0.06964, "grad_norm": 2.921875, "grad_norm_var": 0.028804524739583334, "learning_rate": 0.0001, "loss": 4.6738, "loss/crossentropy": 1.9252901673316956, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2976933419704437, "step": 3482 }, { "epoch": 0.06968, "grad_norm": 2.25, "grad_norm_var": 0.03385009765625, "learning_rate": 0.0001, "loss": 4.7679, "loss/crossentropy": 2.258090019226074, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.27399829030036926, "step": 3484 }, { "epoch": 0.06972, "grad_norm": 2.265625, "grad_norm_var": 0.03192952473958333, "learning_rate": 0.0001, "loss": 4.7614, "loss/crossentropy": 2.2776867151260376, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.29224833846092224, "step": 3486 }, { "epoch": 0.06976, "grad_norm": 2.703125, "grad_norm_var": 0.031819661458333336, "learning_rate": 0.0001, "loss": 5.1628, "loss/crossentropy": 2.097061276435852, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26103661954402924, "step": 3488 }, { "epoch": 0.0698, "grad_norm": 2.28125, "grad_norm_var": 0.03264567057291667, "learning_rate": 0.0001, "loss": 4.7364, "loss/crossentropy": 2.206419885158539, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2775641232728958, "step": 3490 }, { "epoch": 0.06984, "grad_norm": 2.59375, "grad_norm_var": 0.035563151041666664, "learning_rate": 0.0001, "loss": 4.7216, "loss/crossentropy": 1.962704062461853, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2421455979347229, "step": 3492 }, { "epoch": 0.06988, "grad_norm": 2.4375, "grad_norm_var": 0.032698567708333334, "learning_rate": 0.0001, "loss": 4.7358, "loss/crossentropy": 2.223302483558655, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2709382176399231, "step": 3494 }, { "epoch": 0.06992, "grad_norm": 2.390625, "grad_norm_var": 0.034764607747395836, "learning_rate": 0.0001, "loss": 4.8563, "loss/crossentropy": 2.259950876235962, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.29514479637145996, "step": 3496 }, { "epoch": 0.06996, "grad_norm": 2.359375, "grad_norm_var": 0.021141560872395833, "learning_rate": 0.0001, "loss": 4.6147, "loss/crossentropy": 2.2337416410446167, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2585422098636627, "step": 3498 }, { "epoch": 0.07, "grad_norm": 2.75, "grad_norm_var": 0.022663370768229166, "learning_rate": 0.0001, "loss": 4.9278, "loss/crossentropy": 2.131904423236847, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.32493171095848083, "step": 3500 }, { "epoch": 0.07004, "grad_norm": 3.109375, "grad_norm_var": 0.04553629557291667, "learning_rate": 0.0001, "loss": 4.9285, "loss/crossentropy": 2.461912155151367, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2917899489402771, "step": 3502 }, { "epoch": 0.07008, "grad_norm": 2.40625, "grad_norm_var": 0.04366861979166667, "learning_rate": 0.0001, "loss": 4.6466, "loss/crossentropy": 2.05659943819046, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24786780774593353, "step": 3504 }, { "epoch": 0.07012, "grad_norm": 2.625, "grad_norm_var": 0.0412261962890625, "learning_rate": 0.0001, "loss": 4.7282, "loss/crossentropy": 2.3972705602645874, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.29012058675289154, "step": 3506 }, { "epoch": 0.07016, "grad_norm": 2.390625, "grad_norm_var": 0.03945210774739583, "learning_rate": 0.0001, "loss": 4.6561, "loss/crossentropy": 1.8465647101402283, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26938022673130035, "step": 3508 }, { "epoch": 0.0702, "grad_norm": 2.65625, "grad_norm_var": 0.0434722900390625, "learning_rate": 0.0001, "loss": 4.9007, "loss/crossentropy": 2.2447493076324463, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.27260421216487885, "step": 3510 }, { "epoch": 0.07024, "grad_norm": 2.453125, "grad_norm_var": 0.042464192708333334, "learning_rate": 0.0001, "loss": 4.7774, "loss/crossentropy": 2.4258209466934204, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.285652831196785, "step": 3512 }, { "epoch": 0.07028, "grad_norm": 2.328125, "grad_norm_var": 0.04248046875, "learning_rate": 0.0001, "loss": 4.7353, "loss/crossentropy": 2.1033068895339966, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2628294378519058, "step": 3514 }, { "epoch": 0.07032, "grad_norm": 2.5, "grad_norm_var": 0.03819071451822917, "learning_rate": 0.0001, "loss": 4.8036, "loss/crossentropy": 2.2740964889526367, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.27228833734989166, "step": 3516 }, { "epoch": 0.07036, "grad_norm": 2.265625, "grad_norm_var": 0.015104166666666667, "learning_rate": 0.0001, "loss": 4.7699, "loss/crossentropy": 2.2315655946731567, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26613348722457886, "step": 3518 }, { "epoch": 0.0704, "grad_norm": 2.359375, "grad_norm_var": 0.014720662434895834, "learning_rate": 0.0001, "loss": 4.6038, "loss/crossentropy": 1.9001839756965637, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.248238705098629, "step": 3520 }, { "epoch": 0.07044, "grad_norm": 2.59375, "grad_norm_var": 0.013655598958333333, "learning_rate": 0.0001, "loss": 4.7034, "loss/crossentropy": 2.0940937399864197, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.28850243985652924, "step": 3522 }, { "epoch": 0.07048, "grad_norm": 2.421875, "grad_norm_var": 0.014338175455729166, "learning_rate": 0.0001, "loss": 4.9187, "loss/crossentropy": 1.9088054299354553, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2577759325504303, "step": 3524 }, { "epoch": 0.07052, "grad_norm": 2.359375, "grad_norm_var": 0.01031494140625, "learning_rate": 0.0001, "loss": 4.7852, "loss/crossentropy": 2.1965672969818115, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2696071192622185, "step": 3526 }, { "epoch": 0.07056, "grad_norm": 2.59375, "grad_norm_var": 0.010770670572916667, "learning_rate": 0.0001, "loss": 4.7657, "loss/crossentropy": 2.245758891105652, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2646239101886749, "step": 3528 }, { "epoch": 0.0706, "grad_norm": 2.546875, "grad_norm_var": 0.013895670572916666, "learning_rate": 0.0001, "loss": 4.7794, "loss/crossentropy": 2.180204927921295, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2582162171602249, "step": 3530 }, { "epoch": 0.07064, "grad_norm": 2.71875, "grad_norm_var": 0.019562784830729166, "learning_rate": 0.0001, "loss": 5.04, "loss/crossentropy": 2.193474531173706, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.40557755529880524, "step": 3532 }, { "epoch": 0.07068, "grad_norm": 2.78125, "grad_norm_var": 0.021361287434895834, "learning_rate": 0.0001, "loss": 4.9562, "loss/crossentropy": 2.2667607069015503, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.28741554915905, "step": 3534 }, { "epoch": 0.07072, "grad_norm": 2.3125, "grad_norm_var": 0.022093709309895834, "learning_rate": 0.0001, "loss": 4.8215, "loss/crossentropy": 1.9890388250350952, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2612537443637848, "step": 3536 }, { "epoch": 0.07076, "grad_norm": 2.390625, "grad_norm_var": 0.02392578125, "learning_rate": 0.0001, "loss": 4.7544, "loss/crossentropy": 1.9390615820884705, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23259633034467697, "step": 3538 }, { "epoch": 0.0708, "grad_norm": 2.515625, "grad_norm_var": 0.023193359375, "learning_rate": 0.0001, "loss": 4.8227, "loss/crossentropy": 2.3050389289855957, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2738967537879944, "step": 3540 }, { "epoch": 0.07084, "grad_norm": 2.265625, "grad_norm_var": 0.025862630208333334, "learning_rate": 0.0001, "loss": 4.4323, "loss/crossentropy": 2.3832077980041504, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.29863911867141724, "step": 3542 }, { "epoch": 0.07088, "grad_norm": 2.4375, "grad_norm_var": 0.02822265625, "learning_rate": 0.0001, "loss": 4.7101, "loss/crossentropy": 1.8186699748039246, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22366883605718613, "step": 3544 }, { "epoch": 0.07092, "grad_norm": 2.46875, "grad_norm_var": 0.024235026041666666, "learning_rate": 0.0001, "loss": 4.8151, "loss/crossentropy": 2.2650288343429565, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2733971029520035, "step": 3546 }, { "epoch": 0.07096, "grad_norm": 2.53125, "grad_norm_var": 0.0191070556640625, "learning_rate": 0.0001, "loss": 4.7514, "loss/crossentropy": 2.432945966720581, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.269818976521492, "step": 3548 }, { "epoch": 0.071, "grad_norm": 4.15625, "grad_norm_var": 0.19250386555989582, "learning_rate": 0.0001, "loss": 4.9461, "loss/crossentropy": 2.021497666835785, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2488800287246704, "step": 3550 }, { "epoch": 0.07104, "grad_norm": 2.3125, "grad_norm_var": 0.2001129150390625, "learning_rate": 0.0001, "loss": 5.0341, "loss/crossentropy": 2.0840535163879395, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2780974507331848, "step": 3552 }, { "epoch": 0.07108, "grad_norm": 2.515625, "grad_norm_var": 0.19724833170572917, "learning_rate": 0.0001, "loss": 4.8123, "loss/crossentropy": 2.352238416671753, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2830911874771118, "step": 3554 }, { "epoch": 0.07112, "grad_norm": 2.484375, "grad_norm_var": 0.19795633951822916, "learning_rate": 0.0001, "loss": 4.9632, "loss/crossentropy": 2.395397186279297, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2861028015613556, "step": 3556 }, { "epoch": 0.07116, "grad_norm": 2.84375, "grad_norm_var": 0.19630533854166668, "learning_rate": 0.0001, "loss": 5.0203, "loss/crossentropy": 2.6454248428344727, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.28888577222824097, "step": 3558 }, { "epoch": 0.0712, "grad_norm": 2.609375, "grad_norm_var": 0.19394429524739584, "learning_rate": 0.0001, "loss": 4.8228, "loss/crossentropy": 2.21127188205719, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2676347494125366, "step": 3560 }, { "epoch": 0.07124, "grad_norm": 2.5625, "grad_norm_var": 0.19245503743489584, "learning_rate": 0.0001, "loss": 4.7502, "loss/crossentropy": 2.0187097787857056, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24967321753501892, "step": 3562 }, { "epoch": 0.07128, "grad_norm": 2.4375, "grad_norm_var": 0.19409077962239582, "learning_rate": 0.0001, "loss": 4.9548, "loss/crossentropy": 2.1822619438171387, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24893341958522797, "step": 3564 }, { "epoch": 0.07132, "grad_norm": 2.65625, "grad_norm_var": 0.03178609212239583, "learning_rate": 0.0001, "loss": 5.0912, "loss/crossentropy": 2.486607313156128, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.29665203392505646, "step": 3566 }, { "epoch": 0.07136, "grad_norm": 2.546875, "grad_norm_var": 0.020048014322916665, "learning_rate": 0.0001, "loss": 5.0494, "loss/crossentropy": 2.2631163597106934, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.268096499145031, "step": 3568 }, { "epoch": 0.0714, "grad_norm": 2.671875, "grad_norm_var": 0.02760009765625, "learning_rate": 0.0001, "loss": 4.7668, "loss/crossentropy": 2.3393882513046265, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.29145532846450806, "step": 3570 }, { "epoch": 0.07144, "grad_norm": 2.65625, "grad_norm_var": 0.03980712890625, "learning_rate": 0.0001, "loss": 4.8281, "loss/crossentropy": 2.007299244403839, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.30512700974941254, "step": 3572 }, { "epoch": 0.07148, "grad_norm": 2.609375, "grad_norm_var": 0.0369049072265625, "learning_rate": 0.0001, "loss": 4.7891, "loss/crossentropy": 1.9879329800605774, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22510841488838196, "step": 3574 }, { "epoch": 0.07152, "grad_norm": 2.46875, "grad_norm_var": 0.03443603515625, "learning_rate": 0.0001, "loss": 4.9688, "loss/crossentropy": 2.387833833694458, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3007010221481323, "step": 3576 }, { "epoch": 0.07156, "grad_norm": 2.625, "grad_norm_var": 0.03664957682291667, "learning_rate": 0.0001, "loss": 4.7975, "loss/crossentropy": 2.3306411504745483, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.269440695643425, "step": 3578 }, { "epoch": 0.0716, "grad_norm": 2.5, "grad_norm_var": 0.03462626139322917, "learning_rate": 0.0001, "loss": 4.5348, "loss/crossentropy": 1.8359156847000122, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23353691399097443, "step": 3580 }, { "epoch": 0.07164, "grad_norm": 2.5625, "grad_norm_var": 0.033080037434895834, "learning_rate": 0.0001, "loss": 4.9226, "loss/crossentropy": 2.257680654525757, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2764490246772766, "step": 3582 }, { "epoch": 0.07168, "grad_norm": 2.4375, "grad_norm_var": 0.03310445149739583, "learning_rate": 0.0001, "loss": 4.6434, "loss/crossentropy": 2.589483857154846, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.28172188997268677, "step": 3584 }, { "epoch": 0.07172, "grad_norm": 2.390625, "grad_norm_var": 0.04182840983072917, "learning_rate": 0.0001, "loss": 4.5434, "loss/crossentropy": 1.8202016949653625, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22330023348331451, "step": 3586 }, { "epoch": 0.07176, "grad_norm": 2.421875, "grad_norm_var": 0.028609212239583334, "learning_rate": 0.0001, "loss": 4.8344, "loss/crossentropy": 2.0311816334724426, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2622206509113312, "step": 3588 }, { "epoch": 0.0718, "grad_norm": 2.296875, "grad_norm_var": 0.028400675455729166, "learning_rate": 0.0001, "loss": 4.5158, "loss/crossentropy": 1.7991753220558167, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23093865811824799, "step": 3590 }, { "epoch": 0.07184, "grad_norm": 2.265625, "grad_norm_var": 0.030924479166666668, "learning_rate": 0.0001, "loss": 4.3148, "loss/crossentropy": 1.6141473054885864, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21914401650428772, "step": 3592 }, { "epoch": 0.07188, "grad_norm": 2.34375, "grad_norm_var": 0.0289703369140625, "learning_rate": 0.0001, "loss": 4.8007, "loss/crossentropy": 2.3337208032608032, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2612123265862465, "step": 3594 }, { "epoch": 0.07192, "grad_norm": 2.265625, "grad_norm_var": 0.0322662353515625, "learning_rate": 0.0001, "loss": 4.5811, "loss/crossentropy": 2.191028594970703, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2598741352558136, "step": 3596 }, { "epoch": 0.07196, "grad_norm": 2.546875, "grad_norm_var": 0.03178609212239583, "learning_rate": 0.0001, "loss": 4.9562, "loss/crossentropy": 2.0293691158294678, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.27143266797065735, "step": 3598 }, { "epoch": 0.072, "grad_norm": 2.640625, "grad_norm_var": 0.03437398274739583, "learning_rate": 0.0001, "loss": 5.1335, "loss/crossentropy": 2.1257725954055786, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.33300966024398804, "step": 3600 }, { "epoch": 0.07204, "grad_norm": 2.375, "grad_norm_var": 0.0171875, "learning_rate": 0.0001, "loss": 4.8223, "loss/crossentropy": 2.296278953552246, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.28724825382232666, "step": 3602 }, { "epoch": 0.07208, "grad_norm": 2.453125, "grad_norm_var": 0.016731770833333333, "learning_rate": 0.0001, "loss": 4.8925, "loss/crossentropy": 2.258358597755432, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26510895788669586, "step": 3604 }, { "epoch": 0.07212, "grad_norm": 2.421875, "grad_norm_var": 0.0170318603515625, "learning_rate": 0.0001, "loss": 5.0383, "loss/crossentropy": 2.0454649925231934, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2725224494934082, "step": 3606 }, { "epoch": 0.07216, "grad_norm": 2.390625, "grad_norm_var": 0.015250651041666667, "learning_rate": 0.0001, "loss": 4.6582, "loss/crossentropy": 2.1844204664230347, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2729914039373398, "step": 3608 }, { "epoch": 0.0722, "grad_norm": 2.484375, "grad_norm_var": 0.015608723958333333, "learning_rate": 0.0001, "loss": 4.4613, "loss/crossentropy": 1.8897106647491455, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26307350397109985, "step": 3610 }, { "epoch": 0.07224, "grad_norm": 2.5, "grad_norm_var": 0.013377888997395834, "learning_rate": 0.0001, "loss": 4.5695, "loss/crossentropy": 1.9441962838172913, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24454529583454132, "step": 3612 }, { "epoch": 0.07228, "grad_norm": 2.6875, "grad_norm_var": 0.063623046875, "learning_rate": 0.0001, "loss": 4.7654, "loss/crossentropy": 2.10969078540802, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2900787442922592, "step": 3614 }, { "epoch": 0.07232, "grad_norm": 2.453125, "grad_norm_var": 0.06301676432291667, "learning_rate": 0.0001, "loss": 4.5195, "loss/crossentropy": 2.1384644508361816, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2722831964492798, "step": 3616 }, { "epoch": 0.07236, "grad_norm": 2.296875, "grad_norm_var": 0.06412760416666667, "learning_rate": 0.0001, "loss": 4.4995, "loss/crossentropy": 2.0648157596588135, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24860350787639618, "step": 3618 }, { "epoch": 0.0724, "grad_norm": 2.421875, "grad_norm_var": 0.06441650390625, "learning_rate": 0.0001, "loss": 4.7863, "loss/crossentropy": 2.2188034057617188, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2476331740617752, "step": 3620 }, { "epoch": 0.07244, "grad_norm": 2.4375, "grad_norm_var": 0.06516927083333333, "learning_rate": 0.0001, "loss": 4.792, "loss/crossentropy": 2.1361395120620728, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24962469190359116, "step": 3622 }, { "epoch": 0.07248, "grad_norm": 2.46875, "grad_norm_var": 0.06457417805989583, "learning_rate": 0.0001, "loss": 4.739, "loss/crossentropy": 2.2646392583847046, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.260420486330986, "step": 3624 }, { "epoch": 0.07252, "grad_norm": 2.421875, "grad_norm_var": 0.0630523681640625, "learning_rate": 0.0001, "loss": 4.6937, "loss/crossentropy": 2.2822424173355103, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2585812509059906, "step": 3626 }, { "epoch": 0.07256, "grad_norm": 2.203125, "grad_norm_var": 0.06852925618489583, "learning_rate": 0.0001, "loss": 4.2868, "loss/crossentropy": 1.8197516798973083, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24321961402893066, "step": 3628 }, { "epoch": 0.0726, "grad_norm": 2.4375, "grad_norm_var": 0.007649739583333333, "learning_rate": 0.0001, "loss": 4.4835, "loss/crossentropy": 2.1475032567977905, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2687895894050598, "step": 3630 }, { "epoch": 0.07264, "grad_norm": 2.25, "grad_norm_var": 0.006538899739583334, "learning_rate": 0.0001, "loss": 4.2353, "loss/crossentropy": 1.9497992992401123, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24942665547132492, "step": 3632 }, { "epoch": 0.07268, "grad_norm": 2.5, "grad_norm_var": 0.007059733072916667, "learning_rate": 0.0001, "loss": 4.7455, "loss/crossentropy": 1.8786492347717285, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23713821917772293, "step": 3634 }, { "epoch": 0.07272, "grad_norm": 2.4375, "grad_norm_var": 0.00738525390625, "learning_rate": 0.0001, "loss": 5.089, "loss/crossentropy": 2.474532127380371, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3133770227432251, "step": 3636 }, { "epoch": 0.07276, "grad_norm": 2.484375, "grad_norm_var": 0.0073394775390625, "learning_rate": 0.0001, "loss": 4.912, "loss/crossentropy": 2.1231455206871033, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2547219544649124, "step": 3638 }, { "epoch": 0.0728, "grad_norm": 2.546875, "grad_norm_var": 0.009007771809895834, "learning_rate": 0.0001, "loss": 4.4727, "loss/crossentropy": 2.0511630177497864, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26947300136089325, "step": 3640 }, { "epoch": 0.07284, "grad_norm": 2.5625, "grad_norm_var": 0.01060791015625, "learning_rate": 0.0001, "loss": 4.7332, "loss/crossentropy": 1.7076187133789062, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21764510869979858, "step": 3642 }, { "epoch": 0.07288, "grad_norm": 2.359375, "grad_norm_var": 0.009723917643229166, "learning_rate": 0.0001, "loss": 4.8396, "loss/crossentropy": 2.069926142692566, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26851218193769455, "step": 3644 }, { "epoch": 0.07292, "grad_norm": 2.609375, "grad_norm_var": 0.010054524739583333, "learning_rate": 0.0001, "loss": 5.0802, "loss/crossentropy": 2.1369277238845825, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.27878354489803314, "step": 3646 }, { "epoch": 0.07296, "grad_norm": 2.375, "grad_norm_var": 0.008072916666666667, "learning_rate": 0.0001, "loss": 4.3659, "loss/crossentropy": 2.095974624156952, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25285808742046356, "step": 3648 }, { "epoch": 0.073, "grad_norm": 2.5625, "grad_norm_var": 0.011888631184895833, "learning_rate": 0.0001, "loss": 4.9301, "loss/crossentropy": 2.240627646446228, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24894578754901886, "step": 3650 }, { "epoch": 0.07304, "grad_norm": 2.359375, "grad_norm_var": 0.013899739583333333, "learning_rate": 0.0001, "loss": 4.6588, "loss/crossentropy": 2.2270851135253906, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26908691227436066, "step": 3652 }, { "epoch": 0.07308, "grad_norm": 2.671875, "grad_norm_var": 0.017039998372395834, "learning_rate": 0.0001, "loss": 4.7385, "loss/crossentropy": 2.2684017419815063, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.28442947566509247, "step": 3654 }, { "epoch": 0.07312, "grad_norm": 2.46875, "grad_norm_var": 0.07785542805989583, "learning_rate": 0.0001, "loss": 4.7585, "loss/crossentropy": 2.0922030806541443, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2621888816356659, "step": 3656 }, { "epoch": 0.07316, "grad_norm": 2.3125, "grad_norm_var": 0.08323567708333333, "learning_rate": 0.0001, "loss": 4.7425, "loss/crossentropy": 2.0134947896003723, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2717447876930237, "step": 3658 }, { "epoch": 0.0732, "grad_norm": 2.421875, "grad_norm_var": 0.08206278483072917, "learning_rate": 0.0001, "loss": 4.5573, "loss/crossentropy": 1.9246947765350342, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25095127522945404, "step": 3660 }, { "epoch": 0.07324, "grad_norm": 2.5625, "grad_norm_var": 0.08561197916666667, "learning_rate": 0.0001, "loss": 4.8616, "loss/crossentropy": 2.0655113458633423, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24166538566350937, "step": 3662 }, { "epoch": 0.07328, "grad_norm": 2.46875, "grad_norm_var": 0.08198954264322916, "learning_rate": 0.0001, "loss": 4.9137, "loss/crossentropy": 2.2706735730171204, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26484307646751404, "step": 3664 }, { "epoch": 0.07332, "grad_norm": 2.359375, "grad_norm_var": 0.0883453369140625, "learning_rate": 0.0001, "loss": 4.2911, "loss/crossentropy": 1.7969809770584106, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23407897353172302, "step": 3666 }, { "epoch": 0.07336, "grad_norm": 2.421875, "grad_norm_var": 0.09109700520833333, "learning_rate": 0.0001, "loss": 4.7081, "loss/crossentropy": 2.0398870706558228, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2502745985984802, "step": 3668 }, { "epoch": 0.0734, "grad_norm": 2.40625, "grad_norm_var": 0.09381103515625, "learning_rate": 0.0001, "loss": 4.8738, "loss/crossentropy": 2.090283453464508, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2936585247516632, "step": 3670 }, { "epoch": 0.07344, "grad_norm": 2.5625, "grad_norm_var": 0.035008748372395836, "learning_rate": 0.0001, "loss": 4.8305, "loss/crossentropy": 2.286925792694092, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.27951307594776154, "step": 3672 }, { "epoch": 0.07348, "grad_norm": 2.40625, "grad_norm_var": 0.032059733072916666, "learning_rate": 0.0001, "loss": 4.4885, "loss/crossentropy": 2.0264610052108765, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24908316135406494, "step": 3674 }, { "epoch": 0.07352, "grad_norm": 2.6875, "grad_norm_var": 0.0372467041015625, "learning_rate": 0.0001, "loss": 4.9239, "loss/crossentropy": 2.1947755217552185, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2865990996360779, "step": 3676 }, { "epoch": 0.07356, "grad_norm": 2.703125, "grad_norm_var": 0.0422515869140625, "learning_rate": 0.0001, "loss": 4.8784, "loss/crossentropy": 2.0050416588783264, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23362033069133759, "step": 3678 }, { "epoch": 0.0736, "grad_norm": 2.4375, "grad_norm_var": 0.041112263997395836, "learning_rate": 0.0001, "loss": 4.7423, "loss/crossentropy": 1.8935424089431763, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2532464489340782, "step": 3680 }, { "epoch": 0.07364, "grad_norm": 2.40625, "grad_norm_var": 0.0350250244140625, "learning_rate": 0.0001, "loss": 4.7389, "loss/crossentropy": 2.0181053280830383, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2544455900788307, "step": 3682 }, { "epoch": 0.07368, "grad_norm": 2.359375, "grad_norm_var": 0.03144124348958333, "learning_rate": 0.0001, "loss": 4.7099, "loss/crossentropy": 2.1172796487808228, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.27609144151210785, "step": 3684 }, { "epoch": 0.07372, "grad_norm": 2.375, "grad_norm_var": 0.023737589518229168, "learning_rate": 0.0001, "loss": 4.7185, "loss/crossentropy": 2.3926355838775635, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.28892992436885834, "step": 3686 }, { "epoch": 0.07376, "grad_norm": 2.21875, "grad_norm_var": 0.026883951822916665, "learning_rate": 0.0001, "loss": 4.9086, "loss/crossentropy": 2.2512835264205933, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26166096329689026, "step": 3688 }, { "epoch": 0.0738, "grad_norm": 2.578125, "grad_norm_var": 0.0264312744140625, "learning_rate": 0.0001, "loss": 4.6311, "loss/crossentropy": 2.0656538009643555, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25399819016456604, "step": 3690 }, { "epoch": 0.07384, "grad_norm": 2.453125, "grad_norm_var": 0.0222808837890625, "learning_rate": 0.0001, "loss": 4.9565, "loss/crossentropy": 2.454928994178772, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2964669317007065, "step": 3692 }, { "epoch": 0.07388, "grad_norm": 2.4375, "grad_norm_var": 0.009859212239583333, "learning_rate": 0.0001, "loss": 4.5703, "loss/crossentropy": 1.988040804862976, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.27345800399780273, "step": 3694 }, { "epoch": 0.07392, "grad_norm": 2.671875, "grad_norm_var": 0.013752237955729166, "learning_rate": 0.0001, "loss": 4.9418, "loss/crossentropy": 1.910742998123169, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24037255346775055, "step": 3696 }, { "epoch": 0.07396, "grad_norm": 2.421875, "grad_norm_var": 0.013700358072916667, "learning_rate": 0.0001, "loss": 4.6541, "loss/crossentropy": 2.19545578956604, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2778017520904541, "step": 3698 }, { "epoch": 0.074, "grad_norm": 2.46875, "grad_norm_var": 0.0127838134765625, "learning_rate": 0.0001, "loss": 4.3839, "loss/crossentropy": 2.436691403388977, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26254843175411224, "step": 3700 }, { "epoch": 0.07404, "grad_norm": 2.5, "grad_norm_var": 0.012398274739583333, "learning_rate": 0.0001, "loss": 4.833, "loss/crossentropy": 2.7458308935165405, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.27952516078948975, "step": 3702 }, { "epoch": 0.07408, "grad_norm": 2.78125, "grad_norm_var": 0.014876302083333333, "learning_rate": 0.0001, "loss": 4.8308, "loss/crossentropy": 2.2321633100509644, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2633504122495651, "step": 3704 }, { "epoch": 0.07412, "grad_norm": 2.359375, "grad_norm_var": 0.013801066080729167, "learning_rate": 0.0001, "loss": 4.8159, "loss/crossentropy": 1.9883576035499573, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25319087505340576, "step": 3706 }, { "epoch": 0.07416, "grad_norm": 2.4375, "grad_norm_var": 0.0147857666015625, "learning_rate": 0.0001, "loss": 4.5546, "loss/crossentropy": 1.7647870182991028, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21732009947299957, "step": 3708 }, { "epoch": 0.0742, "grad_norm": 2.734375, "grad_norm_var": 0.046793619791666664, "learning_rate": 0.0001, "loss": 4.9271, "loss/crossentropy": 2.1113381385803223, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25888970494270325, "step": 3710 }, { "epoch": 0.07424, "grad_norm": 2.34375, "grad_norm_var": 0.04690348307291667, "learning_rate": 0.0001, "loss": 4.5878, "loss/crossentropy": 1.975549578666687, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23899925500154495, "step": 3712 }, { "epoch": 0.07428, "grad_norm": 2.40625, "grad_norm_var": 0.04664306640625, "learning_rate": 0.0001, "loss": 4.9262, "loss/crossentropy": 2.0562495589256287, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.31428879499435425, "step": 3714 }, { "epoch": 0.07432, "grad_norm": 2.265625, "grad_norm_var": 0.04951883951822917, "learning_rate": 0.0001, "loss": 4.3719, "loss/crossentropy": 2.114805221557617, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25172004848718643, "step": 3716 }, { "epoch": 0.07436, "grad_norm": 2.65625, "grad_norm_var": 0.052783203125, "learning_rate": 0.0001, "loss": 4.6032, "loss/crossentropy": 2.1865739822387695, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.252632200717926, "step": 3718 }, { "epoch": 0.0744, "grad_norm": 2.53125, "grad_norm_var": 0.04739176432291667, "learning_rate": 0.0001, "loss": 4.8493, "loss/crossentropy": 2.2550876140594482, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.27425335347652435, "step": 3720 }, { "epoch": 0.07444, "grad_norm": 2.640625, "grad_norm_var": 0.047379557291666666, "learning_rate": 0.0001, "loss": 4.9072, "loss/crossentropy": 2.293414354324341, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26979324221611023, "step": 3722 }, { "epoch": 0.07448, "grad_norm": 2.59375, "grad_norm_var": 0.043745930989583334, "learning_rate": 0.0001, "loss": 4.4962, "loss/crossentropy": 2.014510452747345, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25709769129753113, "step": 3724 }, { "epoch": 0.07452, "grad_norm": 2.40625, "grad_norm_var": 0.015746053059895834, "learning_rate": 0.0001, "loss": 4.6485, "loss/crossentropy": 2.0332603454589844, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2585446834564209, "step": 3726 }, { "epoch": 0.07456, "grad_norm": 2.296875, "grad_norm_var": 0.0165191650390625, "learning_rate": 0.0001, "loss": 4.7268, "loss/crossentropy": 1.9425334930419922, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22921039909124374, "step": 3728 }, { "epoch": 0.0746, "grad_norm": 2.71875, "grad_norm_var": 0.019090779622395835, "learning_rate": 0.0001, "loss": 4.9306, "loss/crossentropy": 2.1233898997306824, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26014500856399536, "step": 3730 }, { "epoch": 0.07464, "grad_norm": 2.609375, "grad_norm_var": 0.023566691080729167, "learning_rate": 0.0001, "loss": 4.9958, "loss/crossentropy": 2.3929240703582764, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2791339308023453, "step": 3732 }, { "epoch": 0.07468, "grad_norm": 2.546875, "grad_norm_var": 0.021370442708333333, "learning_rate": 0.0001, "loss": 4.6072, "loss/crossentropy": 2.163137674331665, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26793332397937775, "step": 3734 }, { "epoch": 0.07472, "grad_norm": 2.453125, "grad_norm_var": 0.024738566080729166, "learning_rate": 0.0001, "loss": 4.723, "loss/crossentropy": 2.1300129294395447, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2533607929944992, "step": 3736 }, { "epoch": 0.07476, "grad_norm": 2.5, "grad_norm_var": 0.028055826822916668, "learning_rate": 0.0001, "loss": 4.7232, "loss/crossentropy": 1.9808942675590515, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25632843375205994, "step": 3738 }, { "epoch": 0.0748, "grad_norm": 2.453125, "grad_norm_var": 0.03328348795572917, "learning_rate": 0.0001, "loss": 4.8219, "loss/crossentropy": 2.161437451839447, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2529585212469101, "step": 3740 }, { "epoch": 0.07484, "grad_norm": 2.53125, "grad_norm_var": 0.03369140625, "learning_rate": 0.0001, "loss": 4.6541, "loss/crossentropy": 1.852737545967102, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2333993762731552, "step": 3742 }, { "epoch": 0.07488, "grad_norm": 2.265625, "grad_norm_var": 0.03439127604166667, "learning_rate": 0.0001, "loss": 4.4355, "loss/crossentropy": 1.664733350276947, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20309723168611526, "step": 3744 }, { "epoch": 0.07492, "grad_norm": 2.203125, "grad_norm_var": 0.03276265462239583, "learning_rate": 0.0001, "loss": 4.4554, "loss/crossentropy": 2.1815799474716187, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2470541000366211, "step": 3746 }, { "epoch": 0.07496, "grad_norm": 2.421875, "grad_norm_var": 0.024657185872395834, "learning_rate": 0.0001, "loss": 4.7423, "loss/crossentropy": 1.9546562433242798, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2685271203517914, "step": 3748 }, { "epoch": 0.075, "grad_norm": 2.25, "grad_norm_var": 0.024738566080729166, "learning_rate": 0.0001, "loss": 4.5171, "loss/crossentropy": 1.920817255973816, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2558089941740036, "step": 3750 }, { "epoch": 0.07504, "grad_norm": 2.265625, "grad_norm_var": 0.024982706705729166, "learning_rate": 0.0001, "loss": 4.4795, "loss/crossentropy": 1.7570490837097168, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2265816479921341, "step": 3752 }, { "epoch": 0.07508, "grad_norm": 2.890625, "grad_norm_var": 0.203564453125, "learning_rate": 0.0001, "loss": 4.756, "loss/crossentropy": 2.1815105676651, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2536233216524124, "step": 3754 }, { "epoch": 0.07512, "grad_norm": 2.4375, "grad_norm_var": 0.19650777180989584, "learning_rate": 0.0001, "loss": 4.7178, "loss/crossentropy": 2.1385504603385925, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2445325404405594, "step": 3756 }, { "epoch": 0.07516, "grad_norm": 2.34375, "grad_norm_var": 0.19650777180989584, "learning_rate": 0.0001, "loss": 4.6449, "loss/crossentropy": 1.7325092554092407, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2419673353433609, "step": 3758 }, { "epoch": 0.0752, "grad_norm": 2.21875, "grad_norm_var": 0.19572652180989583, "learning_rate": 0.0001, "loss": 4.6096, "loss/crossentropy": 2.358627676963806, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2713842839002609, "step": 3760 }, { "epoch": 0.07524, "grad_norm": 3.078125, "grad_norm_var": 0.20392252604166666, "learning_rate": 0.0001, "loss": 5.1712, "loss/crossentropy": 2.048672080039978, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3007172644138336, "step": 3762 }, { "epoch": 0.07528, "grad_norm": 2.609375, "grad_norm_var": 0.19885660807291666, "learning_rate": 0.0001, "loss": 4.8473, "loss/crossentropy": 2.2967183589935303, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.28548331558704376, "step": 3764 }, { "epoch": 0.07532, "grad_norm": 2.3125, "grad_norm_var": 0.19228108723958334, "learning_rate": 0.0001, "loss": 4.7541, "loss/crossentropy": 2.1280438899993896, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2825516611337662, "step": 3766 }, { "epoch": 0.07536, "grad_norm": 2.375, "grad_norm_var": 0.19146728515625, "learning_rate": 0.0001, "loss": 4.8404, "loss/crossentropy": 2.5528002977371216, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.28681764006614685, "step": 3768 }, { "epoch": 0.0754, "grad_norm": 2.546875, "grad_norm_var": 0.0549468994140625, "learning_rate": 0.0001, "loss": 4.729, "loss/crossentropy": 2.235885262489319, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.259520560503006, "step": 3770 }, { "epoch": 0.07544, "grad_norm": 2.421875, "grad_norm_var": 0.05718994140625, "learning_rate": 0.0001, "loss": 4.4705, "loss/crossentropy": 1.8836966753005981, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2360890954732895, "step": 3772 }, { "epoch": 0.07548, "grad_norm": 2.6875, "grad_norm_var": 0.056538899739583336, "learning_rate": 0.0001, "loss": 4.9291, "loss/crossentropy": 2.3396376371383667, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.30392636358737946, "step": 3774 }, { "epoch": 0.07552, "grad_norm": 2.359375, "grad_norm_var": 0.051301066080729166, "learning_rate": 0.0001, "loss": 4.7518, "loss/crossentropy": 2.4024877548217773, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2858506590127945, "step": 3776 }, { "epoch": 0.07556, "grad_norm": 2.25, "grad_norm_var": 0.0353515625, "learning_rate": 0.0001, "loss": 4.4073, "loss/crossentropy": 2.138229727745056, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25458595901727676, "step": 3778 }, { "epoch": 0.0756, "grad_norm": 2.328125, "grad_norm_var": 0.0359375, "learning_rate": 0.0001, "loss": 4.3882, "loss/crossentropy": 1.8413254618644714, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23022352159023285, "step": 3780 }, { "epoch": 0.07564, "grad_norm": 2.390625, "grad_norm_var": 0.034764607747395836, "learning_rate": 0.0001, "loss": 4.613, "loss/crossentropy": 1.8554572463035583, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25644390285015106, "step": 3782 }, { "epoch": 0.07568, "grad_norm": 2.34375, "grad_norm_var": 0.018452962239583332, "learning_rate": 0.0001, "loss": 4.7013, "loss/crossentropy": 2.0096731781959534, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25370142608880997, "step": 3784 }, { "epoch": 0.07572, "grad_norm": 2.65625, "grad_norm_var": 0.020018513997395834, "learning_rate": 0.0001, "loss": 4.9317, "loss/crossentropy": 1.7932413220405579, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2426912784576416, "step": 3786 }, { "epoch": 0.07576, "grad_norm": 2.125, "grad_norm_var": 0.0263336181640625, "learning_rate": 0.0001, "loss": 4.1599, "loss/crossentropy": 2.0372042655944824, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24203844368457794, "step": 3788 }, { "epoch": 0.0758, "grad_norm": 2.296875, "grad_norm_var": 0.021773274739583334, "learning_rate": 0.0001, "loss": 4.3627, "loss/crossentropy": 1.8986076712608337, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24614746868610382, "step": 3790 }, { "epoch": 0.07584, "grad_norm": 2.234375, "grad_norm_var": 0.022135416666666668, "learning_rate": 0.0001, "loss": 4.563, "loss/crossentropy": 1.8080393075942993, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23206676542758942, "step": 3792 }, { "epoch": 0.07588, "grad_norm": 2.25, "grad_norm_var": 0.015067545572916667, "learning_rate": 0.0001, "loss": 4.6857, "loss/crossentropy": 1.7578041553497314, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21686340868473053, "step": 3794 }, { "epoch": 0.07592, "grad_norm": 2.40625, "grad_norm_var": 0.015087890625, "learning_rate": 0.0001, "loss": 4.4938, "loss/crossentropy": 2.0115376710891724, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2651440501213074, "step": 3796 }, { "epoch": 0.07596, "grad_norm": 2.40625, "grad_norm_var": 0.015803019205729168, "learning_rate": 0.0001, "loss": 4.598, "loss/crossentropy": 2.028555393218994, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26161982119083405, "step": 3798 }, { "epoch": 0.076, "grad_norm": 2.515625, "grad_norm_var": 0.016429646809895834, "learning_rate": 0.0001, "loss": 4.8315, "loss/crossentropy": 2.158663272857666, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.28110067546367645, "step": 3800 }, { "epoch": 0.07604, "grad_norm": 2.734375, "grad_norm_var": 0.019652303059895834, "learning_rate": 0.0001, "loss": 5.2376, "loss/crossentropy": 2.2959556579589844, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.28477419912815094, "step": 3802 }, { "epoch": 0.07608, "grad_norm": 2.296875, "grad_norm_var": 0.014997355143229167, "learning_rate": 0.0001, "loss": 4.5289, "loss/crossentropy": 2.149766206741333, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25216003507375717, "step": 3804 }, { "epoch": 0.07612, "grad_norm": 2.390625, "grad_norm_var": 0.01416015625, "learning_rate": 0.0001, "loss": 4.7496, "loss/crossentropy": 1.9866302609443665, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26193149387836456, "step": 3806 }, { "epoch": 0.07616, "grad_norm": 2.46875, "grad_norm_var": 0.0122955322265625, "learning_rate": 0.0001, "loss": 4.9968, "loss/crossentropy": 2.4230403900146484, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.27897247672080994, "step": 3808 }, { "epoch": 0.0762, "grad_norm": 2.65625, "grad_norm_var": 0.012613932291666666, "learning_rate": 0.0001, "loss": 4.9133, "loss/crossentropy": 2.2995522022247314, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.27623192965984344, "step": 3810 }, { "epoch": 0.07624, "grad_norm": 2.484375, "grad_norm_var": 0.013923136393229167, "learning_rate": 0.0001, "loss": 4.6632, "loss/crossentropy": 2.167468547821045, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26121728122234344, "step": 3812 }, { "epoch": 0.07628, "grad_norm": 2.5, "grad_norm_var": 0.013353474934895833, "learning_rate": 0.0001, "loss": 4.8435, "loss/crossentropy": 2.3259944915771484, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.29026439785957336, "step": 3814 }, { "epoch": 0.07632, "grad_norm": 2.46875, "grad_norm_var": 0.041825358072916666, "learning_rate": 0.0001, "loss": 4.8704, "loss/crossentropy": 2.18080472946167, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2550061345100403, "step": 3816 }, { "epoch": 0.07636, "grad_norm": 2.390625, "grad_norm_var": 0.03871968587239583, "learning_rate": 0.0001, "loss": 4.5681, "loss/crossentropy": 2.1185330748558044, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25014883279800415, "step": 3818 }, { "epoch": 0.0764, "grad_norm": 2.28125, "grad_norm_var": 0.039383951822916666, "learning_rate": 0.0001, "loss": 4.5776, "loss/crossentropy": 1.9028193354606628, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22502654790878296, "step": 3820 }, { "epoch": 0.07644, "grad_norm": 2.640625, "grad_norm_var": 0.039713541666666664, "learning_rate": 0.0001, "loss": 5.0188, "loss/crossentropy": 2.266402840614319, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24728389829397202, "step": 3822 }, { "epoch": 0.07648, "grad_norm": 2.546875, "grad_norm_var": 0.04279683430989583, "learning_rate": 0.0001, "loss": 4.7036, "loss/crossentropy": 2.0918440222740173, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2669295519590378, "step": 3824 }, { "epoch": 0.07652, "grad_norm": 2.296875, "grad_norm_var": 0.04456380208333333, "learning_rate": 0.0001, "loss": 4.1446, "loss/crossentropy": 1.6120481491088867, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21910656243562698, "step": 3826 }, { "epoch": 0.07656, "grad_norm": 2.609375, "grad_norm_var": 0.044331868489583336, "learning_rate": 0.0001, "loss": 4.6352, "loss/crossentropy": 2.2294809818267822, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2720891535282135, "step": 3828 }, { "epoch": 0.0766, "grad_norm": 2.765625, "grad_norm_var": 0.0524322509765625, "learning_rate": 0.0001, "loss": 4.873, "loss/crossentropy": 2.2588730454444885, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2613115608692169, "step": 3830 }, { "epoch": 0.07664, "grad_norm": 2.28125, "grad_norm_var": 0.02584228515625, "learning_rate": 0.0001, "loss": 4.5881, "loss/crossentropy": 2.2658292055130005, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2729395925998688, "step": 3832 }, { "epoch": 0.07668, "grad_norm": 2.484375, "grad_norm_var": 0.02662353515625, "learning_rate": 0.0001, "loss": 5.1787, "loss/crossentropy": 2.314574718475342, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.28539060056209564, "step": 3834 }, { "epoch": 0.07672, "grad_norm": 2.546875, "grad_norm_var": 0.024738566080729166, "learning_rate": 0.0001, "loss": 5.0388, "loss/crossentropy": 2.4684417247772217, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.29843954741954803, "step": 3836 }, { "epoch": 0.07676, "grad_norm": 2.546875, "grad_norm_var": 0.025944010416666666, "learning_rate": 0.0001, "loss": 4.5976, "loss/crossentropy": 2.528733253479004, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2696636766195297, "step": 3838 }, { "epoch": 0.0768, "grad_norm": 2.390625, "grad_norm_var": 0.025340779622395834, "learning_rate": 0.0001, "loss": 4.6449, "loss/crossentropy": 2.203901529312134, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26598773896694183, "step": 3840 }, { "epoch": 0.07684, "grad_norm": 2.5625, "grad_norm_var": 0.021907552083333334, "learning_rate": 0.0001, "loss": 4.7322, "loss/crossentropy": 1.9192892909049988, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25631098449230194, "step": 3842 }, { "epoch": 0.07688, "grad_norm": 2.3125, "grad_norm_var": 0.022077433268229165, "learning_rate": 0.0001, "loss": 4.6372, "loss/crossentropy": 1.8314838409423828, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22842589765787125, "step": 3844 }, { "epoch": 0.07692, "grad_norm": 2.3125, "grad_norm_var": 0.013395182291666667, "learning_rate": 0.0001, "loss": 4.6023, "loss/crossentropy": 2.2416744232177734, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24890189617872238, "step": 3846 }, { "epoch": 0.07696, "grad_norm": 2.46875, "grad_norm_var": 0.012580362955729167, "learning_rate": 0.0001, "loss": 4.8454, "loss/crossentropy": 2.034749209880829, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.29580502212047577, "step": 3848 }, { "epoch": 0.077, "grad_norm": 2.4375, "grad_norm_var": 0.0117095947265625, "learning_rate": 0.0001, "loss": 4.5923, "loss/crossentropy": 1.9982805848121643, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2618003487586975, "step": 3850 }, { "epoch": 0.07704, "grad_norm": 2.59375, "grad_norm_var": 0.0127105712890625, "learning_rate": 0.0001, "loss": 4.7704, "loss/crossentropy": 2.065816104412079, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2571987137198448, "step": 3852 }, { "epoch": 0.07708, "grad_norm": 2.46875, "grad_norm_var": 0.0100982666015625, "learning_rate": 0.0001, "loss": 4.7493, "loss/crossentropy": 1.933334231376648, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24824900180101395, "step": 3854 }, { "epoch": 0.07712, "grad_norm": 2.28125, "grad_norm_var": 0.010514322916666667, "learning_rate": 0.0001, "loss": 4.5805, "loss/crossentropy": 1.9197405576705933, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2559673935174942, "step": 3856 }, { "epoch": 0.07716, "grad_norm": 2.3125, "grad_norm_var": 0.010252888997395833, "learning_rate": 0.0001, "loss": 4.429, "loss/crossentropy": 2.307250142097473, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2613677680492401, "step": 3858 }, { "epoch": 0.0772, "grad_norm": 2.28125, "grad_norm_var": 0.01011962890625, "learning_rate": 0.0001, "loss": 4.4494, "loss/crossentropy": 2.1120635271072388, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24830741435289383, "step": 3860 }, { "epoch": 0.07724, "grad_norm": 2.484375, "grad_norm_var": 0.039013671875, "learning_rate": 0.0001, "loss": 4.8585, "loss/crossentropy": 2.404169201850891, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2960711419582367, "step": 3862 }, { "epoch": 0.07728, "grad_norm": 2.421875, "grad_norm_var": 0.038899739583333336, "learning_rate": 0.0001, "loss": 5.0257, "loss/crossentropy": 2.2490307688713074, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2569812461733818, "step": 3864 }, { "epoch": 0.07732, "grad_norm": 2.34375, "grad_norm_var": 0.0386871337890625, "learning_rate": 0.0001, "loss": 4.9086, "loss/crossentropy": 2.0773178339004517, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2505042105913162, "step": 3866 }, { "epoch": 0.07736, "grad_norm": 2.40625, "grad_norm_var": 0.038899739583333336, "learning_rate": 0.0001, "loss": 4.4621, "loss/crossentropy": 1.83626389503479, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22876176983118057, "step": 3868 }, { "epoch": 0.0774, "grad_norm": 2.34375, "grad_norm_var": 0.03911031087239583, "learning_rate": 0.0001, "loss": 4.5298, "loss/crossentropy": 1.8159971833229065, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22787011414766312, "step": 3870 }, { "epoch": 0.07744, "grad_norm": 2.453125, "grad_norm_var": 0.0378326416015625, "learning_rate": 0.0001, "loss": 4.5995, "loss/crossentropy": 2.0361026525497437, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23802263289690018, "step": 3872 }, { "epoch": 0.07748, "grad_norm": 2.703125, "grad_norm_var": 0.04433492024739583, "learning_rate": 0.0001, "loss": 4.9506, "loss/crossentropy": 2.2464375495910645, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26598919928073883, "step": 3874 }, { "epoch": 0.07752, "grad_norm": 2.453125, "grad_norm_var": 0.0408355712890625, "learning_rate": 0.0001, "loss": 4.9416, "loss/crossentropy": 2.163287401199341, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.29068198800086975, "step": 3876 }, { "epoch": 0.07756, "grad_norm": 2.5, "grad_norm_var": 0.22014567057291667, "learning_rate": 0.0001, "loss": 4.9478, "loss/crossentropy": 2.1638875007629395, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2609352171421051, "step": 3878 }, { "epoch": 0.0776, "grad_norm": 2.703125, "grad_norm_var": 0.21923421223958334, "learning_rate": 0.0001, "loss": 4.6247, "loss/crossentropy": 1.9216270446777344, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25062238425016403, "step": 3880 }, { "epoch": 0.07764, "grad_norm": 2.828125, "grad_norm_var": 0.21585184733072918, "learning_rate": 0.0001, "loss": 5.0786, "loss/crossentropy": 2.036958694458008, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2897229939699173, "step": 3882 }, { "epoch": 0.07768, "grad_norm": 2.5625, "grad_norm_var": 0.2094635009765625, "learning_rate": 0.0001, "loss": 4.6062, "loss/crossentropy": 2.1493492126464844, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.258526012301445, "step": 3884 }, { "epoch": 0.07772, "grad_norm": 2.484375, "grad_norm_var": 0.20414937337239583, "learning_rate": 0.0001, "loss": 4.468, "loss/crossentropy": 2.0496288537979126, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2386137992143631, "step": 3886 }, { "epoch": 0.07776, "grad_norm": 2.40625, "grad_norm_var": 0.20829671223958332, "learning_rate": 0.0001, "loss": 4.2402, "loss/crossentropy": 1.5763422846794128, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2227308303117752, "step": 3888 }, { "epoch": 0.0778, "grad_norm": 2.640625, "grad_norm_var": 0.20852864583333333, "learning_rate": 0.0001, "loss": 5.0582, "loss/crossentropy": 2.5032416582107544, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3047761619091034, "step": 3890 }, { "epoch": 0.07784, "grad_norm": 2.375, "grad_norm_var": 0.21199544270833334, "learning_rate": 0.0001, "loss": 4.7694, "loss/crossentropy": 2.3609601259231567, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.27848154306411743, "step": 3892 }, { "epoch": 0.07788, "grad_norm": 2.328125, "grad_norm_var": 0.0202056884765625, "learning_rate": 0.0001, "loss": 4.776, "loss/crossentropy": 2.188078999519348, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2570301741361618, "step": 3894 }, { "epoch": 0.07792, "grad_norm": 2.546875, "grad_norm_var": 0.018733723958333334, "learning_rate": 0.0001, "loss": 4.8374, "loss/crossentropy": 1.9860637784004211, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25376833230257034, "step": 3896 }, { "epoch": 0.07796, "grad_norm": 2.234375, "grad_norm_var": 0.010399373372395833, "learning_rate": 0.0001, "loss": 4.404, "loss/crossentropy": 2.0886037945747375, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24766233563423157, "step": 3898 }, { "epoch": 0.078, "grad_norm": 2.3125, "grad_norm_var": 0.009845987955729166, "learning_rate": 0.0001, "loss": 4.6833, "loss/crossentropy": 2.373010039329529, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.27916407585144043, "step": 3900 }, { "epoch": 0.07804, "grad_norm": 2.359375, "grad_norm_var": 0.009837849934895834, "learning_rate": 0.0001, "loss": 4.6295, "loss/crossentropy": 1.6733890771865845, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2211885154247284, "step": 3902 }, { "epoch": 0.07808, "grad_norm": 2.46875, "grad_norm_var": 0.010184733072916667, "learning_rate": 0.0001, "loss": 4.6588, "loss/crossentropy": 2.0506675243377686, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.27091431617736816, "step": 3904 }, { "epoch": 0.07812, "grad_norm": 2.28125, "grad_norm_var": 0.008784993489583334, "learning_rate": 0.0001, "loss": 4.712, "loss/crossentropy": 2.3200724124908447, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26983049511909485, "step": 3906 }, { "epoch": 0.07816, "grad_norm": 2.765625, "grad_norm_var": 0.0164459228515625, "learning_rate": 0.0001, "loss": 4.7171, "loss/crossentropy": 1.928814709186554, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2423655241727829, "step": 3908 }, { "epoch": 0.0782, "grad_norm": 2.40625, "grad_norm_var": 0.016389973958333335, "learning_rate": 0.0001, "loss": 4.6944, "loss/crossentropy": 2.007555842399597, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2444767728447914, "step": 3910 }, { "epoch": 0.07824, "grad_norm": 2.875, "grad_norm_var": 0.027469889322916666, "learning_rate": 0.0001, "loss": 4.7955, "loss/crossentropy": 2.2054057121276855, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2849784791469574, "step": 3912 }, { "epoch": 0.07828, "grad_norm": 2.53125, "grad_norm_var": 0.03986002604166667, "learning_rate": 0.0001, "loss": 4.6885, "loss/crossentropy": 2.331532597541809, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2820790112018585, "step": 3914 }, { "epoch": 0.07832, "grad_norm": 2.40625, "grad_norm_var": 0.036519368489583336, "learning_rate": 0.0001, "loss": 4.6967, "loss/crossentropy": 2.142041563987732, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25799722969532013, "step": 3916 }, { "epoch": 0.07836, "grad_norm": 2.625, "grad_norm_var": 0.0394683837890625, "learning_rate": 0.0001, "loss": 4.5415, "loss/crossentropy": 2.010735809803009, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24202881753444672, "step": 3918 }, { "epoch": 0.0784, "grad_norm": 2.4375, "grad_norm_var": 0.03970947265625, "learning_rate": 0.0001, "loss": 5.037, "loss/crossentropy": 2.382808804512024, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.27038049697875977, "step": 3920 }, { "epoch": 0.07844, "grad_norm": 2.375, "grad_norm_var": 0.0372955322265625, "learning_rate": 0.0001, "loss": 4.7708, "loss/crossentropy": 2.099658191204071, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2557126358151436, "step": 3922 }, { "epoch": 0.07848, "grad_norm": 2.375, "grad_norm_var": 0.03968098958333333, "learning_rate": 0.0001, "loss": 4.3775, "loss/crossentropy": 1.7840275764465332, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22102414071559906, "step": 3924 }, { "epoch": 0.07852, "grad_norm": 2.5, "grad_norm_var": 0.039159138997395836, "learning_rate": 0.0001, "loss": 4.6637, "loss/crossentropy": 1.8730725049972534, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23219943791627884, "step": 3926 }, { "epoch": 0.07856, "grad_norm": 2.203125, "grad_norm_var": 0.034520467122395836, "learning_rate": 0.0001, "loss": 4.4432, "loss/crossentropy": 1.9218623638153076, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23333143442869186, "step": 3928 }, { "epoch": 0.0786, "grad_norm": 2.46875, "grad_norm_var": 0.0152740478515625, "learning_rate": 0.0001, "loss": 4.8944, "loss/crossentropy": 1.9885727763175964, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2596626430749893, "step": 3930 }, { "epoch": 0.07864, "grad_norm": 2.390625, "grad_norm_var": 0.014989217122395834, "learning_rate": 0.0001, "loss": 4.9358, "loss/crossentropy": 2.397018015384674, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2790713906288147, "step": 3932 }, { "epoch": 0.07868, "grad_norm": 2.1875, "grad_norm_var": 0.014383951822916666, "learning_rate": 0.0001, "loss": 4.3062, "loss/crossentropy": 1.7345170378684998, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23668452352285385, "step": 3934 }, { "epoch": 0.07872, "grad_norm": 2.25, "grad_norm_var": 0.013309733072916666, "learning_rate": 0.0001, "loss": 4.6173, "loss/crossentropy": 1.8630162477493286, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2371346428990364, "step": 3936 }, { "epoch": 0.07876, "grad_norm": 2.40625, "grad_norm_var": 0.010367838541666667, "learning_rate": 0.0001, "loss": 4.5774, "loss/crossentropy": 2.0738128423690796, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24340355396270752, "step": 3938 }, { "epoch": 0.0788, "grad_norm": 2.40625, "grad_norm_var": 0.010835774739583333, "learning_rate": 0.0001, "loss": 5.0027, "loss/crossentropy": 2.2932467460632324, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2527567446231842, "step": 3940 }, { "epoch": 0.07884, "grad_norm": 2.46875, "grad_norm_var": 0.009956868489583333, "learning_rate": 0.0001, "loss": 4.6785, "loss/crossentropy": 2.0000113248825073, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23461253196001053, "step": 3942 }, { "epoch": 0.07888, "grad_norm": 2.296875, "grad_norm_var": 0.008430989583333333, "learning_rate": 0.0001, "loss": 4.7307, "loss/crossentropy": 2.0753955841064453, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2535083442926407, "step": 3944 }, { "epoch": 0.07892, "grad_norm": 2.40625, "grad_norm_var": 0.008153279622395834, "learning_rate": 0.0001, "loss": 4.6906, "loss/crossentropy": 2.167261242866516, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24734552949666977, "step": 3946 }, { "epoch": 0.07896, "grad_norm": 2.578125, "grad_norm_var": 0.0108795166015625, "learning_rate": 0.0001, "loss": 4.7347, "loss/crossentropy": 1.9755831956863403, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24861737340688705, "step": 3948 }, { "epoch": 0.079, "grad_norm": 2.484375, "grad_norm_var": 0.009261067708333333, "learning_rate": 0.0001, "loss": 4.6813, "loss/crossentropy": 2.195417881011963, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2518697530031204, "step": 3950 }, { "epoch": 0.07904, "grad_norm": 2.828125, "grad_norm_var": 0.01920166015625, "learning_rate": 0.0001, "loss": 5.0172, "loss/crossentropy": 2.5771371126174927, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.28423887491226196, "step": 3952 }, { "epoch": 0.07908, "grad_norm": 2.375, "grad_norm_var": 0.017870076497395835, "learning_rate": 0.0001, "loss": 4.7071, "loss/crossentropy": 1.683276355266571, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2075405865907669, "step": 3954 }, { "epoch": 0.07912, "grad_norm": 2.515625, "grad_norm_var": 0.018993123372395834, "learning_rate": 0.0001, "loss": 4.7128, "loss/crossentropy": 2.2983756065368652, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.28010235726833344, "step": 3956 }, { "epoch": 0.07916, "grad_norm": 2.421875, "grad_norm_var": 0.020361328125, "learning_rate": 0.0001, "loss": 4.7896, "loss/crossentropy": 2.3263272047042847, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.28625819087028503, "step": 3958 }, { "epoch": 0.0792, "grad_norm": 2.40625, "grad_norm_var": 0.0204742431640625, "learning_rate": 0.0001, "loss": 4.5201, "loss/crossentropy": 1.9820871353149414, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2594129145145416, "step": 3960 }, { "epoch": 0.07924, "grad_norm": 2.171875, "grad_norm_var": 0.025634765625, "learning_rate": 0.0001, "loss": 4.4754, "loss/crossentropy": 1.8991515636444092, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23694587498903275, "step": 3962 }, { "epoch": 0.07928, "grad_norm": 2.921875, "grad_norm_var": 0.03843994140625, "learning_rate": 0.0001, "loss": 4.9865, "loss/crossentropy": 2.485508918762207, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2957809865474701, "step": 3964 }, { "epoch": 0.07932, "grad_norm": 2.390625, "grad_norm_var": 0.037262980143229166, "learning_rate": 0.0001, "loss": 4.8871, "loss/crossentropy": 2.156081974506378, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.27474651485681534, "step": 3966 }, { "epoch": 0.07936, "grad_norm": 2.5625, "grad_norm_var": 0.028544108072916668, "learning_rate": 0.0001, "loss": 4.8694, "loss/crossentropy": 2.0370571613311768, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2732074484229088, "step": 3968 }, { "epoch": 0.0794, "grad_norm": 2.421875, "grad_norm_var": 0.028251139322916667, "learning_rate": 0.0001, "loss": 5.062, "loss/crossentropy": 2.3039989471435547, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.31054411828517914, "step": 3970 }, { "epoch": 0.07944, "grad_norm": 2.34375, "grad_norm_var": 0.028641764322916666, "learning_rate": 0.0001, "loss": 4.7875, "loss/crossentropy": 2.2280107736587524, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2711277902126312, "step": 3972 }, { "epoch": 0.07948, "grad_norm": 2.625, "grad_norm_var": 0.02935791015625, "learning_rate": 0.0001, "loss": 4.8992, "loss/crossentropy": 2.0609869956970215, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24142058193683624, "step": 3974 }, { "epoch": 0.07952, "grad_norm": 2.296875, "grad_norm_var": 0.03277587890625, "learning_rate": 0.0001, "loss": 4.1391, "loss/crossentropy": 2.0541876554489136, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24537865817546844, "step": 3976 }, { "epoch": 0.07956, "grad_norm": 2.578125, "grad_norm_var": 0.6102701822916666, "learning_rate": 0.0001, "loss": 4.9591, "loss/crossentropy": 2.344236969947815, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2770863175392151, "step": 3978 }, { "epoch": 0.0796, "grad_norm": 2.625, "grad_norm_var": 0.65465087890625, "learning_rate": 0.0001, "loss": 4.7232, "loss/crossentropy": 1.7899338603019714, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23951984196901321, "step": 3980 }, { "epoch": 0.07964, "grad_norm": 2.328125, "grad_norm_var": 0.6591471354166667, "learning_rate": 0.0001, "loss": 4.6771, "loss/crossentropy": 2.3253976106643677, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2691802680492401, "step": 3982 }, { "epoch": 0.07968, "grad_norm": 2.453125, "grad_norm_var": 0.65885009765625, "learning_rate": 0.0001, "loss": 4.8223, "loss/crossentropy": 2.14141583442688, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2603989467024803, "step": 3984 }, { "epoch": 0.07972, "grad_norm": 2.53125, "grad_norm_var": 0.6512196858723959, "learning_rate": 0.0001, "loss": 4.9059, "loss/crossentropy": 2.262465476989746, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.291474387049675, "step": 3986 }, { "epoch": 0.07976, "grad_norm": 2.328125, "grad_norm_var": 0.66064453125, "learning_rate": 0.0001, "loss": 4.5626, "loss/crossentropy": 2.1835561990737915, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23615659773349762, "step": 3988 }, { "epoch": 0.0798, "grad_norm": 2.359375, "grad_norm_var": 0.6716145833333333, "learning_rate": 0.0001, "loss": 4.8482, "loss/crossentropy": 2.020140767097473, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2593151703476906, "step": 3990 }, { "epoch": 0.07984, "grad_norm": 2.59375, "grad_norm_var": 0.6518513997395833, "learning_rate": 0.0001, "loss": 4.7986, "loss/crossentropy": 2.277661681175232, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2721617519855499, "step": 3992 }, { "epoch": 0.07988, "grad_norm": 2.34375, "grad_norm_var": 0.09846089680989584, "learning_rate": 0.0001, "loss": 4.5113, "loss/crossentropy": 2.1883193254470825, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2610222101211548, "step": 3994 }, { "epoch": 0.07992, "grad_norm": 2.421875, "grad_norm_var": 0.01754150390625, "learning_rate": 0.0001, "loss": 4.5987, "loss/crossentropy": 2.152850031852722, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.27418845891952515, "step": 3996 }, { "epoch": 0.07996, "grad_norm": 2.546875, "grad_norm_var": 0.017513020833333334, "learning_rate": 0.0001, "loss": 4.8214, "loss/crossentropy": 2.3313716650009155, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.27529503405094147, "step": 3998 }, { "epoch": 0.08, "grad_norm": 2.640625, "grad_norm_var": 0.020319620768229168, "learning_rate": 0.0001, "loss": 5.086, "loss/crossentropy": 2.250498414039612, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26368021965026855, "step": 4000 }, { "epoch": 0.08004, "grad_norm": 2.4375, "grad_norm_var": 0.011188761393229166, "learning_rate": 0.0001, "loss": 4.8005, "loss/crossentropy": 2.322459101676941, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2868216335773468, "step": 4002 }, { "epoch": 0.08008, "grad_norm": 2.40625, "grad_norm_var": 0.0142730712890625, "learning_rate": 0.0001, "loss": 4.8693, "loss/crossentropy": 1.9340506792068481, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2316983863711357, "step": 4004 }, { "epoch": 0.08012, "grad_norm": 2.359375, "grad_norm_var": 0.0150054931640625, "learning_rate": 0.0001, "loss": 4.7395, "loss/crossentropy": 1.8635645508766174, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22694773972034454, "step": 4006 }, { "epoch": 0.08016, "grad_norm": 10.375, "grad_norm_var": 3.9615631103515625, "learning_rate": 0.0001, "loss": 4.8916, "loss/crossentropy": 1.9252317547798157, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24893560260534286, "step": 4008 }, { "epoch": 0.0802, "grad_norm": 2.671875, "grad_norm_var": 3.9093424479166665, "learning_rate": 0.0001, "loss": 5.2636, "loss/crossentropy": 2.1964328289031982, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2567693591117859, "step": 4010 }, { "epoch": 0.08024, "grad_norm": 2.671875, "grad_norm_var": 3.8960113525390625, "learning_rate": 0.0001, "loss": 4.9054, "loss/crossentropy": 2.296012043952942, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.27360107749700546, "step": 4012 }, { "epoch": 0.08028, "grad_norm": 2.453125, "grad_norm_var": 3.9009724934895833, "learning_rate": 0.0001, "loss": 4.8894, "loss/crossentropy": 2.360015869140625, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.27541905641555786, "step": 4014 }, { "epoch": 0.08032, "grad_norm": 2.390625, "grad_norm_var": 3.906591796875, "learning_rate": 0.0001, "loss": 4.8865, "loss/crossentropy": 2.36370050907135, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.27881547808647156, "step": 4016 }, { "epoch": 0.08036, "grad_norm": 2.25, "grad_norm_var": 3.9068593343098956, "learning_rate": 0.0001, "loss": 4.6461, "loss/crossentropy": 1.8704780340194702, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2512069493532181, "step": 4018 }, { "epoch": 0.0804, "grad_norm": 2.4375, "grad_norm_var": 3.9156483968098956, "learning_rate": 0.0001, "loss": 4.733, "loss/crossentropy": 2.1989234685897827, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2755106985569, "step": 4020 }, { "epoch": 0.08044, "grad_norm": 2.21875, "grad_norm_var": 3.9420237223307293, "learning_rate": 0.0001, "loss": 4.3471, "loss/crossentropy": 1.9905433058738708, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2579897418618202, "step": 4022 }, { "epoch": 0.08048, "grad_norm": 2.515625, "grad_norm_var": 0.08837788899739583, "learning_rate": 0.0001, "loss": 4.9025, "loss/crossentropy": 2.270000696182251, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2693728804588318, "step": 4024 }, { "epoch": 0.08052, "grad_norm": 2.375, "grad_norm_var": 0.08504130045572916, "learning_rate": 0.0001, "loss": 4.7569, "loss/crossentropy": 2.178301692008972, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2617819905281067, "step": 4026 }, { "epoch": 0.08056, "grad_norm": 2.25, "grad_norm_var": 0.08346354166666667, "learning_rate": 0.0001, "loss": 4.687, "loss/crossentropy": 2.518654465675354, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.29282887279987335, "step": 4028 }, { "epoch": 0.0806, "grad_norm": 2.25, "grad_norm_var": 0.08859049479166667, "learning_rate": 0.0001, "loss": 4.4825, "loss/crossentropy": 1.9181422591209412, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23210199177265167, "step": 4030 }, { "epoch": 0.08064, "grad_norm": 2.359375, "grad_norm_var": 0.08816630045572917, "learning_rate": 0.0001, "loss": 4.6407, "loss/crossentropy": 2.343222141265869, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.28908614814281464, "step": 4032 }, { "epoch": 0.08068, "grad_norm": 2.390625, "grad_norm_var": 0.0302886962890625, "learning_rate": 0.0001, "loss": 4.6879, "loss/crossentropy": 2.0816845893859863, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2571762502193451, "step": 4034 }, { "epoch": 0.08072, "grad_norm": 2.296875, "grad_norm_var": 0.029878743489583335, "learning_rate": 0.0001, "loss": 4.3324, "loss/crossentropy": 1.9679544568061829, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2316160574555397, "step": 4036 }, { "epoch": 0.08076, "grad_norm": 2.328125, "grad_norm_var": 0.029195149739583332, "learning_rate": 0.0001, "loss": 4.7335, "loss/crossentropy": 2.1553682684898376, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25725623965263367, "step": 4038 }, { "epoch": 0.0808, "grad_norm": 2.421875, "grad_norm_var": 0.004964192708333333, "learning_rate": 0.0001, "loss": 4.6892, "loss/crossentropy": 2.0269790291786194, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26298412680625916, "step": 4040 }, { "epoch": 0.08084, "grad_norm": 2.484375, "grad_norm_var": 0.0060943603515625, "learning_rate": 0.0001, "loss": 4.6686, "loss/crossentropy": 1.984773874282837, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24288517236709595, "step": 4042 }, { "epoch": 0.08088, "grad_norm": 2.3125, "grad_norm_var": 0.0051910400390625, "learning_rate": 0.0001, "loss": 4.9282, "loss/crossentropy": 2.178356111049652, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2684163451194763, "step": 4044 }, { "epoch": 0.08092, "grad_norm": 2.46875, "grad_norm_var": 0.0053670247395833336, "learning_rate": 0.0001, "loss": 4.8191, "loss/crossentropy": 2.235984683036804, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26377667486667633, "step": 4046 }, { "epoch": 0.08096, "grad_norm": 2.34375, "grad_norm_var": 0.005826822916666667, "learning_rate": 0.0001, "loss": 4.7026, "loss/crossentropy": 2.085321545600891, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23403701931238174, "step": 4048 }, { "epoch": 0.081, "grad_norm": 2.3125, "grad_norm_var": 0.0052642822265625, "learning_rate": 0.0001, "loss": 4.9932, "loss/crossentropy": 2.419228672981262, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.27236658334732056, "step": 4050 }, { "epoch": 0.08104, "grad_norm": 2.421875, "grad_norm_var": 0.0054972330729166664, "learning_rate": 0.0001, "loss": 4.6105, "loss/crossentropy": 2.153620958328247, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.28071053326129913, "step": 4052 }, { "epoch": 0.08108, "grad_norm": 2.71875, "grad_norm_var": 0.012743123372395833, "learning_rate": 0.0001, "loss": 4.8775, "loss/crossentropy": 2.1466477513313293, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.27491800487041473, "step": 4054 }, { "epoch": 0.08112, "grad_norm": 2.265625, "grad_norm_var": 0.013932291666666667, "learning_rate": 0.0001, "loss": 4.3707, "loss/crossentropy": 2.2020710706710815, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25149518996477127, "step": 4056 }, { "epoch": 0.08116, "grad_norm": 2.34375, "grad_norm_var": 0.013623046875, "learning_rate": 0.0001, "loss": 4.8458, "loss/crossentropy": 2.264205574989319, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.27612583339214325, "step": 4058 }, { "epoch": 0.0812, "grad_norm": 2.4375, "grad_norm_var": 0.016434733072916666, "learning_rate": 0.0001, "loss": 4.8644, "loss/crossentropy": 2.269905209541321, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2692303955554962, "step": 4060 }, { "epoch": 0.08124, "grad_norm": 2.34375, "grad_norm_var": 0.018505859375, "learning_rate": 0.0001, "loss": 4.5057, "loss/crossentropy": 1.920631766319275, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22550494968891144, "step": 4062 }, { "epoch": 0.08128, "grad_norm": 2.53125, "grad_norm_var": 0.019749959309895832, "learning_rate": 0.0001, "loss": 5.0796, "loss/crossentropy": 2.307617664337158, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22719035297632217, "step": 4064 }, { "epoch": 0.08132, "grad_norm": 2.375, "grad_norm_var": 0.022102864583333333, "learning_rate": 0.0001, "loss": 4.6167, "loss/crossentropy": 2.113444685935974, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.240354023873806, "step": 4066 }, { "epoch": 0.08136, "grad_norm": 2.375, "grad_norm_var": 0.02232666015625, "learning_rate": 0.0001, "loss": 4.9152, "loss/crossentropy": 2.4516230821609497, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.27590544521808624, "step": 4068 }, { "epoch": 0.0814, "grad_norm": 2.34375, "grad_norm_var": 0.08772379557291667, "learning_rate": 0.0001, "loss": 4.5976, "loss/crossentropy": 1.8287339806556702, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22503511607646942, "step": 4070 }, { "epoch": 0.08144, "grad_norm": 2.5, "grad_norm_var": 0.08479715983072916, "learning_rate": 0.0001, "loss": 5.1623, "loss/crossentropy": 2.3468997478485107, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2773839682340622, "step": 4072 }, { "epoch": 0.08148, "grad_norm": 2.296875, "grad_norm_var": 0.08782145182291666, "learning_rate": 0.0001, "loss": 4.5413, "loss/crossentropy": 2.1307512521743774, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24247504770755768, "step": 4074 }, { "epoch": 0.08152, "grad_norm": 2.21875, "grad_norm_var": 0.08982645670572917, "learning_rate": 0.0001, "loss": 4.7447, "loss/crossentropy": 2.248755097389221, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2696859538555145, "step": 4076 }, { "epoch": 0.08156, "grad_norm": 2.265625, "grad_norm_var": 0.09045817057291666, "learning_rate": 0.0001, "loss": 4.2948, "loss/crossentropy": 2.0233980417251587, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2370016872882843, "step": 4078 }, { "epoch": 0.0816, "grad_norm": 2.234375, "grad_norm_var": 0.0923828125, "learning_rate": 0.0001, "loss": 4.432, "loss/crossentropy": 1.9536627531051636, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23242933303117752, "step": 4080 }, { "epoch": 0.08164, "grad_norm": 2.375, "grad_norm_var": 0.0889801025390625, "learning_rate": 0.0001, "loss": 4.5037, "loss/crossentropy": 1.9631904363632202, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2337196245789528, "step": 4082 }, { "epoch": 0.08168, "grad_norm": 2.546875, "grad_norm_var": 0.08935139973958334, "learning_rate": 0.0001, "loss": 4.7406, "loss/crossentropy": 2.193789482116699, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2687358558177948, "step": 4084 }, { "epoch": 0.08172, "grad_norm": 2.40625, "grad_norm_var": 0.020210774739583333, "learning_rate": 0.0001, "loss": 4.6454, "loss/crossentropy": 2.308240056037903, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2679348289966583, "step": 4086 }, { "epoch": 0.08176, "grad_norm": 2.359375, "grad_norm_var": 0.021903483072916667, "learning_rate": 0.0001, "loss": 4.9126, "loss/crossentropy": 2.343047261238098, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3056950569152832, "step": 4088 }, { "epoch": 0.0818, "grad_norm": 2.234375, "grad_norm_var": 0.022526041666666666, "learning_rate": 0.0001, "loss": 4.7315, "loss/crossentropy": 1.9583085179328918, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23911744356155396, "step": 4090 }, { "epoch": 0.08184, "grad_norm": 2.296875, "grad_norm_var": 0.021198527018229166, "learning_rate": 0.0001, "loss": 4.6559, "loss/crossentropy": 2.341569185256958, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26053962111473083, "step": 4092 }, { "epoch": 0.08188, "grad_norm": 2.921875, "grad_norm_var": 0.034601847330729164, "learning_rate": 0.0001, "loss": 4.5773, "loss/crossentropy": 2.068669080734253, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25753986835479736, "step": 4094 }, { "epoch": 0.08192, "grad_norm": 2.625, "grad_norm_var": 0.04309488932291667, "learning_rate": 0.0001, "loss": 5.0253, "loss/crossentropy": 2.1461241841316223, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2628704681992531, "step": 4096 }, { "epoch": 0.08196, "grad_norm": 2.3125, "grad_norm_var": 0.04810791015625, "learning_rate": 0.0001, "loss": 4.5587, "loss/crossentropy": 2.0718055963516235, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24923217296600342, "step": 4098 }, { "epoch": 0.082, "grad_norm": 2.359375, "grad_norm_var": 0.0503082275390625, "learning_rate": 0.0001, "loss": 4.379, "loss/crossentropy": 1.9812004566192627, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2316955253481865, "step": 4100 }, { "epoch": 0.08204, "grad_norm": 2.3125, "grad_norm_var": 0.0465240478515625, "learning_rate": 0.0001, "loss": 4.7909, "loss/crossentropy": 2.2669100761413574, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2806926518678665, "step": 4102 }, { "epoch": 0.08208, "grad_norm": 2.375, "grad_norm_var": 0.043196614583333334, "learning_rate": 0.0001, "loss": 4.7502, "loss/crossentropy": 2.0620261430740356, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2610047310590744, "step": 4104 }, { "epoch": 0.08212, "grad_norm": 3.046875, "grad_norm_var": 0.06297098795572917, "learning_rate": 0.0001, "loss": 4.6672, "loss/crossentropy": 2.249971866607666, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2783341556787491, "step": 4106 }, { "epoch": 0.08216, "grad_norm": 2.328125, "grad_norm_var": 0.06320699055989583, "learning_rate": 0.0001, "loss": 4.834, "loss/crossentropy": 2.1064823865890503, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.27534550428390503, "step": 4108 }, { "epoch": 0.0822, "grad_norm": 2.28125, "grad_norm_var": 0.05406901041666667, "learning_rate": 0.0001, "loss": 4.3944, "loss/crossentropy": 1.886509656906128, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2315160632133484, "step": 4110 }, { "epoch": 0.08224, "grad_norm": 2.40625, "grad_norm_var": 0.039290364583333334, "learning_rate": 0.0001, "loss": 4.2969, "loss/crossentropy": 1.6429635286331177, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20593100041151047, "step": 4112 }, { "epoch": 0.08228, "grad_norm": 2.40625, "grad_norm_var": 0.0370025634765625, "learning_rate": 0.0001, "loss": 4.4581, "loss/crossentropy": 2.3236618041992188, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2674623131752014, "step": 4114 }, { "epoch": 0.08232, "grad_norm": 2.578125, "grad_norm_var": 0.03658447265625, "learning_rate": 0.0001, "loss": 4.9734, "loss/crossentropy": 2.1479567885398865, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25003983825445175, "step": 4116 }, { "epoch": 0.08236, "grad_norm": 2.578125, "grad_norm_var": 0.03611551920572917, "learning_rate": 0.0001, "loss": 5.0477, "loss/crossentropy": 2.140569031238556, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24322029948234558, "step": 4118 }, { "epoch": 0.0824, "grad_norm": 2.328125, "grad_norm_var": 0.03762613932291667, "learning_rate": 0.0001, "loss": 4.6061, "loss/crossentropy": 2.126375436782837, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.27629759907722473, "step": 4120 }, { "epoch": 0.08244, "grad_norm": 2.28125, "grad_norm_var": 0.015843709309895832, "learning_rate": 0.0001, "loss": 4.9143, "loss/crossentropy": 2.3699214458465576, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2692546397447586, "step": 4122 }, { "epoch": 0.08248, "grad_norm": 2.296875, "grad_norm_var": 0.010904947916666666, "learning_rate": 0.0001, "loss": 4.5672, "loss/crossentropy": 2.013331353664398, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24442073702812195, "step": 4124 }, { "epoch": 0.08252, "grad_norm": 2.359375, "grad_norm_var": 0.010152180989583334, "learning_rate": 0.0001, "loss": 4.4841, "loss/crossentropy": 2.1869460344314575, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.250150203704834, "step": 4126 }, { "epoch": 0.08256, "grad_norm": 2.203125, "grad_norm_var": 0.0127838134765625, "learning_rate": 0.0001, "loss": 4.4564, "loss/crossentropy": 2.2725884914398193, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2810261696577072, "step": 4128 }, { "epoch": 0.0826, "grad_norm": 2.328125, "grad_norm_var": 0.013688151041666667, "learning_rate": 0.0001, "loss": 4.7311, "loss/crossentropy": 1.9190022945404053, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2693525403738022, "step": 4130 }, { "epoch": 0.08264, "grad_norm": 2.65625, "grad_norm_var": 0.016112263997395834, "learning_rate": 0.0001, "loss": 4.7967, "loss/crossentropy": 2.5477795600891113, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.27040669322013855, "step": 4132 }, { "epoch": 0.08268, "grad_norm": 2.40625, "grad_norm_var": 0.0113677978515625, "learning_rate": 0.0001, "loss": 4.7617, "loss/crossentropy": 2.231198728084564, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.28101250529289246, "step": 4134 }, { "epoch": 0.08272, "grad_norm": 2.296875, "grad_norm_var": 0.011693318684895834, "learning_rate": 0.0001, "loss": 4.6334, "loss/crossentropy": 2.17776882648468, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24743208289146423, "step": 4136 }, { "epoch": 0.08276, "grad_norm": 2.46875, "grad_norm_var": 0.011799112955729166, "learning_rate": 0.0001, "loss": 5.0233, "loss/crossentropy": 2.418373703956604, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2954525351524353, "step": 4138 }, { "epoch": 0.0828, "grad_norm": 2.3125, "grad_norm_var": 0.010969034830729167, "learning_rate": 0.0001, "loss": 4.5564, "loss/crossentropy": 2.054605543613434, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23863950371742249, "step": 4140 }, { "epoch": 0.08284, "grad_norm": 2.5, "grad_norm_var": 0.011872355143229167, "learning_rate": 0.0001, "loss": 4.8983, "loss/crossentropy": 2.054013967514038, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.29851172864437103, "step": 4142 }, { "epoch": 0.08288, "grad_norm": 2.46875, "grad_norm_var": 0.008610026041666666, "learning_rate": 0.0001, "loss": 4.7425, "loss/crossentropy": 2.193961024284363, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2563214898109436, "step": 4144 }, { "epoch": 0.08292, "grad_norm": 2.40625, "grad_norm_var": 0.008382161458333334, "learning_rate": 0.0001, "loss": 4.7995, "loss/crossentropy": 2.460008382797241, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26705074310302734, "step": 4146 }, { "epoch": 0.08296, "grad_norm": 2.3125, "grad_norm_var": 0.005060831705729167, "learning_rate": 0.0001, "loss": 4.894, "loss/crossentropy": 2.508321523666382, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26217761635780334, "step": 4148 }, { "epoch": 0.083, "grad_norm": 2.328125, "grad_norm_var": 0.0065582275390625, "learning_rate": 0.0001, "loss": 4.6103, "loss/crossentropy": 1.8445284366607666, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23396535962820053, "step": 4150 }, { "epoch": 0.08304, "grad_norm": 2.375, "grad_norm_var": 0.005952962239583333, "learning_rate": 0.0001, "loss": 4.8048, "loss/crossentropy": 2.433600902557373, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2825329154729843, "step": 4152 }, { "epoch": 0.08308, "grad_norm": 2.421875, "grad_norm_var": 0.0053212483723958336, "learning_rate": 0.0001, "loss": 4.8632, "loss/crossentropy": 2.386221170425415, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2941686511039734, "step": 4154 }, { "epoch": 0.08312, "grad_norm": 2.421875, "grad_norm_var": 0.0052154541015625, "learning_rate": 0.0001, "loss": 4.7486, "loss/crossentropy": 1.9578949809074402, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24851053953170776, "step": 4156 }, { "epoch": 0.08316, "grad_norm": 2.3125, "grad_norm_var": 0.004150390625, "learning_rate": 0.0001, "loss": 4.6443, "loss/crossentropy": 2.0034408569335938, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25524984300136566, "step": 4158 }, { "epoch": 0.0832, "grad_norm": 2.28125, "grad_norm_var": 0.0069488525390625, "learning_rate": 0.0001, "loss": 4.6856, "loss/crossentropy": 2.3342589139938354, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.31023962795734406, "step": 4160 }, { "epoch": 0.08324, "grad_norm": 2.46875, "grad_norm_var": 0.042867024739583336, "learning_rate": 0.0001, "loss": 4.6762, "loss/crossentropy": 2.3941839933395386, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2641746401786804, "step": 4162 }, { "epoch": 0.08328, "grad_norm": 2.453125, "grad_norm_var": 0.04168294270833333, "learning_rate": 0.0001, "loss": 4.7149, "loss/crossentropy": 2.371219038963318, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24197939038276672, "step": 4164 }, { "epoch": 0.08332, "grad_norm": 2.734375, "grad_norm_var": 0.04442952473958333, "learning_rate": 0.0001, "loss": 4.7949, "loss/crossentropy": 2.133378028869629, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25449906289577484, "step": 4166 }, { "epoch": 0.08336, "grad_norm": 2.4375, "grad_norm_var": 0.04496968587239583, "learning_rate": 0.0001, "loss": 4.5974, "loss/crossentropy": 1.7460412979125977, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2157151699066162, "step": 4168 }, { "epoch": 0.0834, "grad_norm": 2.609375, "grad_norm_var": 0.046873982747395834, "learning_rate": 0.0001, "loss": 4.7234, "loss/crossentropy": 2.215083122253418, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2823774367570877, "step": 4170 }, { "epoch": 0.08344, "grad_norm": 2.453125, "grad_norm_var": 0.04820048014322917, "learning_rate": 0.0001, "loss": 4.5189, "loss/crossentropy": 2.0528377890586853, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2551003098487854, "step": 4172 }, { "epoch": 0.08348, "grad_norm": 2.359375, "grad_norm_var": 0.05054423014322917, "learning_rate": 0.0001, "loss": 4.4008, "loss/crossentropy": 1.7953855395317078, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23185917735099792, "step": 4174 }, { "epoch": 0.08352, "grad_norm": 2.34375, "grad_norm_var": 0.058934529622395836, "learning_rate": 0.0001, "loss": 4.4879, "loss/crossentropy": 2.0794734954833984, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23386041820049286, "step": 4176 }, { "epoch": 0.08356, "grad_norm": 2.234375, "grad_norm_var": 0.031086222330729166, "learning_rate": 0.0001, "loss": 4.3802, "loss/crossentropy": 2.1685845851898193, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26074862480163574, "step": 4178 }, { "epoch": 0.0836, "grad_norm": 2.484375, "grad_norm_var": 0.031412760416666664, "learning_rate": 0.0001, "loss": 4.5507, "loss/crossentropy": 2.1495825052261353, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2689145505428314, "step": 4180 }, { "epoch": 0.08364, "grad_norm": 2.328125, "grad_norm_var": 0.024967447916666666, "learning_rate": 0.0001, "loss": 4.5258, "loss/crossentropy": 2.043331503868103, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2498578578233719, "step": 4182 }, { "epoch": 0.08368, "grad_norm": 2.734375, "grad_norm_var": 0.07787984212239583, "learning_rate": 0.0001, "loss": 5.0198, "loss/crossentropy": 2.04026997089386, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2873340845108032, "step": 4184 }, { "epoch": 0.08372, "grad_norm": 2.515625, "grad_norm_var": 0.09128316243489583, "learning_rate": 0.0001, "loss": 4.9537, "loss/crossentropy": 2.4653968811035156, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2911294251680374, "step": 4186 }, { "epoch": 0.08376, "grad_norm": 2.421875, "grad_norm_var": 0.09215494791666666, "learning_rate": 0.0001, "loss": 4.6589, "loss/crossentropy": 2.2960848808288574, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23984040319919586, "step": 4188 }, { "epoch": 0.0838, "grad_norm": 2.515625, "grad_norm_var": 0.09599202473958333, "learning_rate": 0.0001, "loss": 4.4324, "loss/crossentropy": 2.011807084083557, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2523125037550926, "step": 4190 }, { "epoch": 0.08384, "grad_norm": 2.421875, "grad_norm_var": 0.0886871337890625, "learning_rate": 0.0001, "loss": 4.8437, "loss/crossentropy": 2.0016889572143555, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2291206791996956, "step": 4192 }, { "epoch": 0.08388, "grad_norm": 2.1875, "grad_norm_var": 0.09378255208333333, "learning_rate": 0.0001, "loss": 4.3604, "loss/crossentropy": 1.97197824716568, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2595779076218605, "step": 4194 }, { "epoch": 0.08392, "grad_norm": 2.34375, "grad_norm_var": 0.09763895670572917, "learning_rate": 0.0001, "loss": 4.5823, "loss/crossentropy": 2.2910103797912598, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24870596826076508, "step": 4196 }, { "epoch": 0.08396, "grad_norm": 2.234375, "grad_norm_var": 0.1001617431640625, "learning_rate": 0.0001, "loss": 4.562, "loss/crossentropy": 2.1453208923339844, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25001347810029984, "step": 4198 }, { "epoch": 0.084, "grad_norm": 2.359375, "grad_norm_var": 0.0398834228515625, "learning_rate": 0.0001, "loss": 4.8835, "loss/crossentropy": 2.1935043334960938, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26807525753974915, "step": 4200 }, { "epoch": 0.08404, "grad_norm": 2.3125, "grad_norm_var": 0.009765625, "learning_rate": 0.0001, "loss": 4.5912, "loss/crossentropy": 2.039341926574707, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2460380420088768, "step": 4202 }, { "epoch": 0.08408, "grad_norm": 3.09375, "grad_norm_var": 0.0478179931640625, "learning_rate": 0.0001, "loss": 4.8243, "loss/crossentropy": 2.4660122394561768, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3042101263999939, "step": 4204 }, { "epoch": 0.08412, "grad_norm": 2.453125, "grad_norm_var": 0.08088785807291667, "learning_rate": 0.0001, "loss": 4.8635, "loss/crossentropy": 1.9346272349357605, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25924334675073624, "step": 4206 }, { "epoch": 0.08416, "grad_norm": 2.1875, "grad_norm_var": 0.08311258951822917, "learning_rate": 0.0001, "loss": 4.5152, "loss/crossentropy": 2.0120063424110413, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24327433109283447, "step": 4208 }, { "epoch": 0.0842, "grad_norm": 2.703125, "grad_norm_var": 0.0852935791015625, "learning_rate": 0.0001, "loss": 4.6148, "loss/crossentropy": 2.2359931468963623, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25374244898557663, "step": 4210 }, { "epoch": 0.08424, "grad_norm": 2.53125, "grad_norm_var": 0.08185221354166666, "learning_rate": 0.0001, "loss": 4.5751, "loss/crossentropy": 2.0038134455680847, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22315364331007004, "step": 4212 }, { "epoch": 0.08428, "grad_norm": 2.296875, "grad_norm_var": 0.08567301432291667, "learning_rate": 0.0001, "loss": 4.721, "loss/crossentropy": 2.2041471004486084, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2655494213104248, "step": 4214 }, { "epoch": 0.08432, "grad_norm": 2.390625, "grad_norm_var": 0.08399149576822916, "learning_rate": 0.0001, "loss": 4.8091, "loss/crossentropy": 2.344551682472229, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2743126451969147, "step": 4216 }, { "epoch": 0.08436, "grad_norm": 2.453125, "grad_norm_var": 0.08025614420572917, "learning_rate": 0.0001, "loss": 4.7162, "loss/crossentropy": 1.9694250226020813, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24425261467695236, "step": 4218 }, { "epoch": 0.0844, "grad_norm": 2.40625, "grad_norm_var": 0.05671284993489583, "learning_rate": 0.0001, "loss": 4.8526, "loss/crossentropy": 2.164921760559082, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.272469699382782, "step": 4220 }, { "epoch": 0.08444, "grad_norm": 2.484375, "grad_norm_var": 0.02642822265625, "learning_rate": 0.0001, "loss": 4.5513, "loss/crossentropy": 1.944575309753418, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23848393559455872, "step": 4222 }, { "epoch": 0.08448, "grad_norm": 2.359375, "grad_norm_var": 0.025386555989583334, "learning_rate": 0.0001, "loss": 4.7416, "loss/crossentropy": 2.278227686882019, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2558315545320511, "step": 4224 }, { "epoch": 0.08452, "grad_norm": 2.453125, "grad_norm_var": 0.01802978515625, "learning_rate": 0.0001, "loss": 4.7318, "loss/crossentropy": 2.035117268562317, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2585935667157173, "step": 4226 }, { "epoch": 0.08456, "grad_norm": 2.234375, "grad_norm_var": 0.020466105143229166, "learning_rate": 0.0001, "loss": 4.5674, "loss/crossentropy": 2.0172035694122314, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23832131922245026, "step": 4228 }, { "epoch": 0.0846, "grad_norm": 2.484375, "grad_norm_var": 0.011165364583333334, "learning_rate": 0.0001, "loss": 4.8113, "loss/crossentropy": 2.0574535727500916, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23829226195812225, "step": 4230 }, { "epoch": 0.08464, "grad_norm": 2.3125, "grad_norm_var": 0.010773722330729167, "learning_rate": 0.0001, "loss": 4.6776, "loss/crossentropy": 2.5003366470336914, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.27905476093292236, "step": 4232 }, { "epoch": 0.08468, "grad_norm": 2.578125, "grad_norm_var": 0.012043253580729166, "learning_rate": 0.0001, "loss": 4.9137, "loss/crossentropy": 2.207367777824402, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.27426937222480774, "step": 4234 }, { "epoch": 0.08472, "grad_norm": 2.1875, "grad_norm_var": 0.01627197265625, "learning_rate": 0.0001, "loss": 4.716, "loss/crossentropy": 2.240189790725708, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26208513230085373, "step": 4236 }, { "epoch": 0.08476, "grad_norm": 2.3125, "grad_norm_var": 0.016600545247395834, "learning_rate": 0.0001, "loss": 4.4569, "loss/crossentropy": 2.1357412338256836, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24108420312404633, "step": 4238 }, { "epoch": 0.0848, "grad_norm": 2.296875, "grad_norm_var": 0.013004557291666666, "learning_rate": 0.0001, "loss": 4.7249, "loss/crossentropy": 2.1073816418647766, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25213342159986496, "step": 4240 }, { "epoch": 0.08484, "grad_norm": 2.40625, "grad_norm_var": 0.013102213541666666, "learning_rate": 0.0001, "loss": 4.9558, "loss/crossentropy": 2.158124566078186, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2505848854780197, "step": 4242 }, { "epoch": 0.08488, "grad_norm": 2.328125, "grad_norm_var": 0.011555989583333334, "learning_rate": 0.0001, "loss": 4.7743, "loss/crossentropy": 2.253539562225342, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2675466388463974, "step": 4244 }, { "epoch": 0.08492, "grad_norm": 2.5, "grad_norm_var": 0.01396484375, "learning_rate": 0.0001, "loss": 4.272, "loss/crossentropy": 1.7170023918151855, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20157357305288315, "step": 4246 }, { "epoch": 0.08496, "grad_norm": 2.40625, "grad_norm_var": 0.0188385009765625, "learning_rate": 0.0001, "loss": 4.4479, "loss/crossentropy": 2.082640767097473, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2600134015083313, "step": 4248 }, { "epoch": 0.085, "grad_norm": 2.34375, "grad_norm_var": 0.0165679931640625, "learning_rate": 0.0001, "loss": 4.4176, "loss/crossentropy": 2.044301390647888, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23220707476139069, "step": 4250 }, { "epoch": 0.08504, "grad_norm": 2.453125, "grad_norm_var": 0.0168121337890625, "learning_rate": 0.0001, "loss": 4.648, "loss/crossentropy": 2.293405532836914, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.28027066588401794, "step": 4252 }, { "epoch": 0.08508, "grad_norm": 2.265625, "grad_norm_var": 0.016649373372395835, "learning_rate": 0.0001, "loss": 4.55, "loss/crossentropy": 2.2604206800460815, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25545646995306015, "step": 4254 }, { "epoch": 0.08512, "grad_norm": 2.65625, "grad_norm_var": 0.023949178059895833, "learning_rate": 0.0001, "loss": 4.6258, "loss/crossentropy": 2.118361234664917, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24799348413944244, "step": 4256 }, { "epoch": 0.08516, "grad_norm": 2.390625, "grad_norm_var": 0.022847493489583332, "learning_rate": 0.0001, "loss": 4.6751, "loss/crossentropy": 1.9369969964027405, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24676478654146194, "step": 4258 }, { "epoch": 0.0852, "grad_norm": 2.40625, "grad_norm_var": 0.021728515625, "learning_rate": 0.0001, "loss": 4.5197, "loss/crossentropy": 2.075170874595642, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21742676943540573, "step": 4260 }, { "epoch": 0.08524, "grad_norm": 2.296875, "grad_norm_var": 0.018973795572916667, "learning_rate": 0.0001, "loss": 4.4112, "loss/crossentropy": 2.056099236011505, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2490842342376709, "step": 4262 }, { "epoch": 0.08528, "grad_norm": 2.328125, "grad_norm_var": 0.014436848958333333, "learning_rate": 0.0001, "loss": 4.6169, "loss/crossentropy": 2.2279993891716003, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24964337795972824, "step": 4264 }, { "epoch": 0.08532, "grad_norm": 2.375, "grad_norm_var": 0.011393229166666666, "learning_rate": 0.0001, "loss": 4.6686, "loss/crossentropy": 2.1645933389663696, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24576786905527115, "step": 4266 }, { "epoch": 0.08536, "grad_norm": 2.25, "grad_norm_var": 0.010887654622395833, "learning_rate": 0.0001, "loss": 4.4458, "loss/crossentropy": 1.9033920764923096, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23393237590789795, "step": 4268 }, { "epoch": 0.0854, "grad_norm": 2.46875, "grad_norm_var": 0.011865234375, "learning_rate": 0.0001, "loss": 4.4022, "loss/crossentropy": 2.153634190559387, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2496839165687561, "step": 4270 }, { "epoch": 0.08544, "grad_norm": 2.28125, "grad_norm_var": 0.0045206705729166664, "learning_rate": 0.0001, "loss": 4.4781, "loss/crossentropy": 1.9188589453697205, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23544569313526154, "step": 4272 }, { "epoch": 0.08548, "grad_norm": 2.328125, "grad_norm_var": 0.004264322916666666, "learning_rate": 0.0001, "loss": 4.704, "loss/crossentropy": 2.4337977170944214, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2951700836420059, "step": 4274 }, { "epoch": 0.08552, "grad_norm": 2.359375, "grad_norm_var": 0.003902180989583333, "learning_rate": 0.0001, "loss": 4.7051, "loss/crossentropy": 1.9108383059501648, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24459081888198853, "step": 4276 }, { "epoch": 0.08556, "grad_norm": 2.375, "grad_norm_var": 0.003123982747395833, "learning_rate": 0.0001, "loss": 4.3751, "loss/crossentropy": 1.6632736921310425, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22495487332344055, "step": 4278 }, { "epoch": 0.0856, "grad_norm": 2.296875, "grad_norm_var": 0.021312459309895834, "learning_rate": 0.0001, "loss": 4.8144, "loss/crossentropy": 2.519997477531433, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2657178193330765, "step": 4280 }, { "epoch": 0.08564, "grad_norm": 2.640625, "grad_norm_var": 0.025275675455729167, "learning_rate": 0.0001, "loss": 4.6837, "loss/crossentropy": 2.150822162628174, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.30192580074071884, "step": 4282 }, { "epoch": 0.08568, "grad_norm": 2.421875, "grad_norm_var": 0.023763020833333332, "learning_rate": 0.0001, "loss": 4.7411, "loss/crossentropy": 1.9970062971115112, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24695321917533875, "step": 4284 }, { "epoch": 0.08572, "grad_norm": 2.25, "grad_norm_var": 0.024372355143229166, "learning_rate": 0.0001, "loss": 4.5361, "loss/crossentropy": 2.3136098384857178, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25082121044397354, "step": 4286 }, { "epoch": 0.08576, "grad_norm": 2.5625, "grad_norm_var": 0.025992838541666667, "learning_rate": 0.0001, "loss": 4.9171, "loss/crossentropy": 2.112035870552063, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.27019062638282776, "step": 4288 }, { "epoch": 0.0858, "grad_norm": 2.328125, "grad_norm_var": 0.025992838541666667, "learning_rate": 0.0001, "loss": 4.4985, "loss/crossentropy": 2.068653643131256, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24297921359539032, "step": 4290 }, { "epoch": 0.08584, "grad_norm": 2.375, "grad_norm_var": 0.025520833333333333, "learning_rate": 0.0001, "loss": 4.5182, "loss/crossentropy": 1.9013578295707703, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23058265447616577, "step": 4292 }, { "epoch": 0.08588, "grad_norm": 2.15625, "grad_norm_var": 0.0294830322265625, "learning_rate": 0.0001, "loss": 4.6825, "loss/crossentropy": 2.149984359741211, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25223904848098755, "step": 4294 }, { "epoch": 0.08592, "grad_norm": 2.328125, "grad_norm_var": 0.014469401041666666, "learning_rate": 0.0001, "loss": 4.4109, "loss/crossentropy": 1.894010066986084, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2323223054409027, "step": 4296 }, { "epoch": 0.08596, "grad_norm": 2.421875, "grad_norm_var": 0.010123697916666667, "learning_rate": 0.0001, "loss": 4.7653, "loss/crossentropy": 2.3351621627807617, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.27194930613040924, "step": 4298 }, { "epoch": 0.086, "grad_norm": 2.328125, "grad_norm_var": 0.0097320556640625, "learning_rate": 0.0001, "loss": 4.741, "loss/crossentropy": 2.224352180957794, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24884501099586487, "step": 4300 }, { "epoch": 0.08604, "grad_norm": 2.421875, "grad_norm_var": 0.008854166666666666, "learning_rate": 0.0001, "loss": 4.6592, "loss/crossentropy": 1.908318042755127, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24747492372989655, "step": 4302 }, { "epoch": 0.08608, "grad_norm": 2.296875, "grad_norm_var": 0.00758056640625, "learning_rate": 0.0001, "loss": 4.8566, "loss/crossentropy": 2.1990396976470947, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.27217794954776764, "step": 4304 }, { "epoch": 0.08612, "grad_norm": 2.390625, "grad_norm_var": 0.0098297119140625, "learning_rate": 0.0001, "loss": 4.6432, "loss/crossentropy": 2.3146010637283325, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26875850558280945, "step": 4306 }, { "epoch": 0.08616, "grad_norm": 2.46875, "grad_norm_var": 0.015208943684895834, "learning_rate": 0.0001, "loss": 4.8254, "loss/crossentropy": 2.2507941722869873, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.27052539587020874, "step": 4308 }, { "epoch": 0.0862, "grad_norm": 2.234375, "grad_norm_var": 0.013199869791666667, "learning_rate": 0.0001, "loss": 4.4067, "loss/crossentropy": 1.9077125787734985, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22987178713083267, "step": 4310 }, { "epoch": 0.08624, "grad_norm": 2.515625, "grad_norm_var": 0.01353759765625, "learning_rate": 0.0001, "loss": 4.4822, "loss/crossentropy": 1.951395332813263, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2392890453338623, "step": 4312 }, { "epoch": 0.08628, "grad_norm": 2.53125, "grad_norm_var": 0.033854166666666664, "learning_rate": 0.0001, "loss": 4.5371, "loss/crossentropy": 1.9426860213279724, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24374966323375702, "step": 4314 }, { "epoch": 0.08632, "grad_norm": 3.390625, "grad_norm_var": 0.09062398274739583, "learning_rate": 0.0001, "loss": 5.253, "loss/crossentropy": 2.2508288621902466, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3862452507019043, "step": 4316 }, { "epoch": 0.08636, "grad_norm": 2.359375, "grad_norm_var": 0.09058329264322916, "learning_rate": 0.0001, "loss": 4.5288, "loss/crossentropy": 2.1161463260650635, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2557579278945923, "step": 4318 }, { "epoch": 0.0864, "grad_norm": 2.3125, "grad_norm_var": 0.09374593098958334, "learning_rate": 0.0001, "loss": 5.1146, "loss/crossentropy": 2.2570544481277466, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.31447017192840576, "step": 4320 }, { "epoch": 0.08644, "grad_norm": 2.328125, "grad_norm_var": 0.08740234375, "learning_rate": 0.0001, "loss": 4.9724, "loss/crossentropy": 2.3211100101470947, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26107798516750336, "step": 4322 }, { "epoch": 0.08648, "grad_norm": 2.25, "grad_norm_var": 0.09231669108072917, "learning_rate": 0.0001, "loss": 4.5236, "loss/crossentropy": 2.1451058387756348, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23633568733930588, "step": 4324 }, { "epoch": 0.08652, "grad_norm": 2.21875, "grad_norm_var": 0.09463602701822917, "learning_rate": 0.0001, "loss": 4.5828, "loss/crossentropy": 1.9880141615867615, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23688867688179016, "step": 4326 }, { "epoch": 0.08656, "grad_norm": 2.296875, "grad_norm_var": 0.09724934895833333, "learning_rate": 0.0001, "loss": 4.7098, "loss/crossentropy": 2.021056890487671, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2543798238039017, "step": 4328 }, { "epoch": 0.0866, "grad_norm": 2.359375, "grad_norm_var": 0.0814453125, "learning_rate": 0.0001, "loss": 4.6439, "loss/crossentropy": 2.1323755979537964, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.27410852909088135, "step": 4330 }, { "epoch": 0.08664, "grad_norm": 2.296875, "grad_norm_var": 0.014940388997395833, "learning_rate": 0.0001, "loss": 4.6945, "loss/crossentropy": 1.9674875736236572, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24818265438079834, "step": 4332 }, { "epoch": 0.08668, "grad_norm": 2.25, "grad_norm_var": 0.01490478515625, "learning_rate": 0.0001, "loss": 4.1911, "loss/crossentropy": 2.0466583967208862, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2567761391401291, "step": 4334 }, { "epoch": 0.08672, "grad_norm": 2.375, "grad_norm_var": 0.004423014322916667, "learning_rate": 0.0001, "loss": 4.5937, "loss/crossentropy": 2.138857126235962, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26430967450141907, "step": 4336 }, { "epoch": 0.08676, "grad_norm": 2.390625, "grad_norm_var": 0.003902180989583333, "learning_rate": 0.0001, "loss": 4.7168, "loss/crossentropy": 2.164841413497925, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24389629065990448, "step": 4338 }, { "epoch": 0.0868, "grad_norm": 2.4375, "grad_norm_var": 0.005322265625, "learning_rate": 0.0001, "loss": 4.6017, "loss/crossentropy": 2.2220189571380615, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24361558258533478, "step": 4340 }, { "epoch": 0.08684, "grad_norm": 2.390625, "grad_norm_var": 0.004792277018229167, "learning_rate": 0.0001, "loss": 4.3088, "loss/crossentropy": 1.7106285095214844, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21558403968811035, "step": 4342 }, { "epoch": 0.08688, "grad_norm": 2.203125, "grad_norm_var": 0.0053670247395833336, "learning_rate": 0.0001, "loss": 4.1647, "loss/crossentropy": 1.9200173020362854, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2502904310822487, "step": 4344 }, { "epoch": 0.08692, "grad_norm": 2.828125, "grad_norm_var": 0.020796712239583334, "learning_rate": 0.0001, "loss": 4.6817, "loss/crossentropy": 1.883722960948944, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24818243086338043, "step": 4346 }, { "epoch": 0.08696, "grad_norm": 2.484375, "grad_norm_var": 0.026146443684895833, "learning_rate": 0.0001, "loss": 4.7509, "loss/crossentropy": 2.2069878578186035, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2518744319677353, "step": 4348 }, { "epoch": 0.087, "grad_norm": 2.484375, "grad_norm_var": 0.026656087239583334, "learning_rate": 0.0001, "loss": 4.7235, "loss/crossentropy": 2.2158325910568237, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2551049590110779, "step": 4350 }, { "epoch": 0.08704, "grad_norm": 2.359375, "grad_norm_var": 0.02730712890625, "learning_rate": 0.0001, "loss": 4.8406, "loss/crossentropy": 2.0580105781555176, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2742728739976883, "step": 4352 }, { "epoch": 0.08708, "grad_norm": 2.34375, "grad_norm_var": 0.02906494140625, "learning_rate": 0.0001, "loss": 4.5267, "loss/crossentropy": 2.190276265144348, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.274506613612175, "step": 4354 }, { "epoch": 0.08712, "grad_norm": 2.3125, "grad_norm_var": 0.027242024739583332, "learning_rate": 0.0001, "loss": 4.7499, "loss/crossentropy": 2.2595328092575073, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25295622646808624, "step": 4356 }, { "epoch": 0.08716, "grad_norm": 2.4375, "grad_norm_var": 0.027391560872395835, "learning_rate": 0.0001, "loss": 5.1247, "loss/crossentropy": 2.322342872619629, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2490948587656021, "step": 4358 }, { "epoch": 0.0872, "grad_norm": 2.328125, "grad_norm_var": 0.024665323893229167, "learning_rate": 0.0001, "loss": 4.7045, "loss/crossentropy": 2.108223795890808, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25404803454875946, "step": 4360 }, { "epoch": 0.08724, "grad_norm": 2.3125, "grad_norm_var": 0.013825480143229167, "learning_rate": 0.0001, "loss": 4.5332, "loss/crossentropy": 2.1363136768341064, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2679157853126526, "step": 4362 }, { "epoch": 0.08728, "grad_norm": 2.390625, "grad_norm_var": 0.0076171875, "learning_rate": 0.0001, "loss": 4.6418, "loss/crossentropy": 2.3207738399505615, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2725762128829956, "step": 4364 }, { "epoch": 0.08732, "grad_norm": 2.40625, "grad_norm_var": 0.004813639322916666, "learning_rate": 0.0001, "loss": 4.7431, "loss/crossentropy": 2.3179128170013428, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.27275949716567993, "step": 4366 }, { "epoch": 0.08736, "grad_norm": 2.328125, "grad_norm_var": 0.005182902018229167, "learning_rate": 0.0001, "loss": 4.7355, "loss/crossentropy": 2.2130206823349, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2787477523088455, "step": 4368 }, { "epoch": 0.0874, "grad_norm": 2.40625, "grad_norm_var": 0.004548136393229167, "learning_rate": 0.0001, "loss": 4.6193, "loss/crossentropy": 2.3350926637649536, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26483266800642014, "step": 4370 }, { "epoch": 0.08744, "grad_norm": 2.4375, "grad_norm_var": 0.005736287434895833, "learning_rate": 0.0001, "loss": 4.7329, "loss/crossentropy": 1.9162638187408447, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23748356848955154, "step": 4372 }, { "epoch": 0.08748, "grad_norm": 2.484375, "grad_norm_var": 0.006322224934895833, "learning_rate": 0.0001, "loss": 4.7045, "loss/crossentropy": 2.2708429098129272, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2709425985813141, "step": 4374 }, { "epoch": 0.08752, "grad_norm": 2.234375, "grad_norm_var": 0.011604817708333333, "learning_rate": 0.0001, "loss": 4.3481, "loss/crossentropy": 1.7216318845748901, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22498781234025955, "step": 4376 }, { "epoch": 0.08756, "grad_norm": 2.328125, "grad_norm_var": 0.011937459309895834, "learning_rate": 0.0001, "loss": 4.4261, "loss/crossentropy": 2.144331693649292, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25400668382644653, "step": 4378 }, { "epoch": 0.0876, "grad_norm": 2.328125, "grad_norm_var": 0.011750284830729167, "learning_rate": 0.0001, "loss": 4.5617, "loss/crossentropy": 2.305369734764099, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.29137127101421356, "step": 4380 }, { "epoch": 0.08764, "grad_norm": 2.65625, "grad_norm_var": 0.018115234375, "learning_rate": 0.0001, "loss": 4.6861, "loss/crossentropy": 2.1156765818595886, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24203064292669296, "step": 4382 }, { "epoch": 0.08768, "grad_norm": 2.28125, "grad_norm_var": 0.016630045572916665, "learning_rate": 0.0001, "loss": 4.4544, "loss/crossentropy": 1.9081769585609436, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22396781295537949, "step": 4384 }, { "epoch": 0.08772, "grad_norm": 2.265625, "grad_norm_var": 0.017769368489583333, "learning_rate": 0.0001, "loss": 4.704, "loss/crossentropy": 2.0938609838485718, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.27330371737480164, "step": 4386 }, { "epoch": 0.08776, "grad_norm": 2.34375, "grad_norm_var": 0.017748006184895835, "learning_rate": 0.0001, "loss": 4.1179, "loss/crossentropy": 1.9685207605361938, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2287725731730461, "step": 4388 }, { "epoch": 0.0878, "grad_norm": 2.3125, "grad_norm_var": 0.016109212239583334, "learning_rate": 0.0001, "loss": 4.5233, "loss/crossentropy": 1.7536925673484802, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2345883920788765, "step": 4390 }, { "epoch": 0.08784, "grad_norm": 2.359375, "grad_norm_var": 0.029150390625, "learning_rate": 0.0001, "loss": 4.3897, "loss/crossentropy": 2.145567834377289, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26126645505428314, "step": 4392 }, { "epoch": 0.08788, "grad_norm": 2.390625, "grad_norm_var": 0.028348795572916665, "learning_rate": 0.0001, "loss": 4.4677, "loss/crossentropy": 2.2211687564849854, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3332909345626831, "step": 4394 }, { "epoch": 0.08792, "grad_norm": 2.375, "grad_norm_var": 0.028539021809895832, "learning_rate": 0.0001, "loss": 4.5545, "loss/crossentropy": 1.919084072113037, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2480143904685974, "step": 4396 }, { "epoch": 0.08796, "grad_norm": 2.421875, "grad_norm_var": 0.025121053059895832, "learning_rate": 0.0001, "loss": 4.6408, "loss/crossentropy": 2.1769548654556274, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.269208699464798, "step": 4398 }, { "epoch": 0.088, "grad_norm": 2.5, "grad_norm_var": 0.028706868489583332, "learning_rate": 0.0001, "loss": 4.7372, "loss/crossentropy": 1.8480086922645569, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2432136833667755, "step": 4400 }, { "epoch": 0.08804, "grad_norm": 2.46875, "grad_norm_var": 0.026838175455729165, "learning_rate": 0.0001, "loss": 4.7303, "loss/crossentropy": 2.1948903799057007, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.28328536450862885, "step": 4402 }, { "epoch": 0.08808, "grad_norm": 2.40625, "grad_norm_var": 0.022554524739583335, "learning_rate": 0.0001, "loss": 4.6929, "loss/crossentropy": 2.163570761680603, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.282375693321228, "step": 4404 }, { "epoch": 0.08812, "grad_norm": 2.34375, "grad_norm_var": 0.022359212239583332, "learning_rate": 0.0001, "loss": 4.6882, "loss/crossentropy": 2.3737696409225464, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26703887432813644, "step": 4406 }, { "epoch": 0.08816, "grad_norm": 2.71875, "grad_norm_var": 0.015192667643229166, "learning_rate": 0.0001, "loss": 4.6959, "loss/crossentropy": 2.2449779510498047, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2744368612766266, "step": 4408 }, { "epoch": 0.0882, "grad_norm": 2.515625, "grad_norm_var": 0.015348307291666667, "learning_rate": 0.0001, "loss": 4.6602, "loss/crossentropy": 2.1167399287223816, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.28688907623291016, "step": 4410 }, { "epoch": 0.08824, "grad_norm": 2.3125, "grad_norm_var": 0.017118326822916665, "learning_rate": 0.0001, "loss": 4.3777, "loss/crossentropy": 2.2249929904937744, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24534232914447784, "step": 4412 }, { "epoch": 0.08828, "grad_norm": 2.421875, "grad_norm_var": 0.019261678059895832, "learning_rate": 0.0001, "loss": 4.7013, "loss/crossentropy": 2.172566771507263, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2783561646938324, "step": 4414 }, { "epoch": 0.08832, "grad_norm": 2.234375, "grad_norm_var": 0.018684895833333333, "learning_rate": 0.0001, "loss": 4.3536, "loss/crossentropy": 2.0709031224250793, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2419627606868744, "step": 4416 }, { "epoch": 0.08836, "grad_norm": 3.359375, "grad_norm_var": 0.0735260009765625, "learning_rate": 0.0001, "loss": 4.8378, "loss/crossentropy": 2.2390655279159546, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.27328142523765564, "step": 4418 }, { "epoch": 0.0884, "grad_norm": 2.59375, "grad_norm_var": 0.07681884765625, "learning_rate": 0.0001, "loss": 4.6879, "loss/crossentropy": 2.061118960380554, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23206621408462524, "step": 4420 }, { "epoch": 0.08844, "grad_norm": 2.140625, "grad_norm_var": 0.09147847493489583, "learning_rate": 0.0001, "loss": 4.3028, "loss/crossentropy": 1.4919558763504028, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17881463468074799, "step": 4422 }, { "epoch": 0.08848, "grad_norm": 2.53125, "grad_norm_var": 0.08662109375, "learning_rate": 0.0001, "loss": 4.7705, "loss/crossentropy": 2.267301082611084, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.27718164026737213, "step": 4424 }, { "epoch": 0.08852, "grad_norm": 2.375, "grad_norm_var": 0.0862457275390625, "learning_rate": 0.0001, "loss": 4.635, "loss/crossentropy": 1.9008094668388367, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22865734994411469, "step": 4426 }, { "epoch": 0.08856, "grad_norm": 2.421875, "grad_norm_var": 0.090966796875, "learning_rate": 0.0001, "loss": 4.385, "loss/crossentropy": 1.8788060545921326, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23521529138088226, "step": 4428 }, { "epoch": 0.0886, "grad_norm": 2.546875, "grad_norm_var": 0.08844401041666666, "learning_rate": 0.0001, "loss": 4.6703, "loss/crossentropy": 2.0775802731513977, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3402235209941864, "step": 4430 }, { "epoch": 0.08864, "grad_norm": 2.515625, "grad_norm_var": 0.08642171223958334, "learning_rate": 0.0001, "loss": 4.5722, "loss/crossentropy": 2.0950201749801636, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2665044367313385, "step": 4432 }, { "epoch": 0.08868, "grad_norm": 2.40625, "grad_norm_var": 0.026167805989583334, "learning_rate": 0.0001, "loss": 4.9235, "loss/crossentropy": 2.328200340270996, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24911007285118103, "step": 4434 }, { "epoch": 0.08872, "grad_norm": 2.515625, "grad_norm_var": 0.023273722330729166, "learning_rate": 0.0001, "loss": 4.7462, "loss/crossentropy": 2.1840142011642456, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2855416387319565, "step": 4436 }, { "epoch": 0.08876, "grad_norm": 2.25, "grad_norm_var": 0.015559895833333334, "learning_rate": 0.0001, "loss": 4.1399, "loss/crossentropy": 1.6167555451393127, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21646380424499512, "step": 4438 }, { "epoch": 0.0888, "grad_norm": 2.25, "grad_norm_var": 0.015543619791666666, "learning_rate": 0.0001, "loss": 4.5274, "loss/crossentropy": 1.9902858138084412, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.237789124250412, "step": 4440 }, { "epoch": 0.08884, "grad_norm": 2.359375, "grad_norm_var": 0.016357421875, "learning_rate": 0.0001, "loss": 4.6682, "loss/crossentropy": 2.4779287576675415, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2726414203643799, "step": 4442 }, { "epoch": 0.08888, "grad_norm": 2.46875, "grad_norm_var": 0.0116119384765625, "learning_rate": 0.0001, "loss": 4.7415, "loss/crossentropy": 2.0914896726608276, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22887279838323593, "step": 4444 }, { "epoch": 0.08892, "grad_norm": 2.265625, "grad_norm_var": 0.010570271809895834, "learning_rate": 0.0001, "loss": 4.498, "loss/crossentropy": 2.0526055693626404, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2509382963180542, "step": 4446 }, { "epoch": 0.08896, "grad_norm": 2.15625, "grad_norm_var": 0.010347493489583333, "learning_rate": 0.0001, "loss": 4.4306, "loss/crossentropy": 1.9779084920883179, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2320951297879219, "step": 4448 }, { "epoch": 0.089, "grad_norm": 2.421875, "grad_norm_var": 0.010399373372395833, "learning_rate": 0.0001, "loss": 4.7725, "loss/crossentropy": 2.2081698179244995, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.29023079574108124, "step": 4450 }, { "epoch": 0.08904, "grad_norm": 2.296875, "grad_norm_var": 0.0074127197265625, "learning_rate": 0.0001, "loss": 4.542, "loss/crossentropy": 1.834806501865387, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23430980741977692, "step": 4452 }, { "epoch": 0.08908, "grad_norm": 2.28125, "grad_norm_var": 0.006884765625, "learning_rate": 0.0001, "loss": 4.7087, "loss/crossentropy": 2.4750468730926514, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.27945323288440704, "step": 4454 }, { "epoch": 0.08912, "grad_norm": 2.28125, "grad_norm_var": 0.007255045572916666, "learning_rate": 0.0001, "loss": 4.6822, "loss/crossentropy": 2.1766942739486694, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.28881968557834625, "step": 4456 }, { "epoch": 0.08916, "grad_norm": 2.28125, "grad_norm_var": 0.009065755208333333, "learning_rate": 0.0001, "loss": 4.9204, "loss/crossentropy": 2.265195846557617, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2616717368364334, "step": 4458 }, { "epoch": 0.0892, "grad_norm": 2.171875, "grad_norm_var": 0.011473592122395833, "learning_rate": 0.0001, "loss": 4.5747, "loss/crossentropy": 2.2438716888427734, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2630545049905777, "step": 4460 }, { "epoch": 0.08924, "grad_norm": 2.28125, "grad_norm_var": 0.012572224934895833, "learning_rate": 0.0001, "loss": 4.3404, "loss/crossentropy": 2.060324013233185, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23639824986457825, "step": 4462 }, { "epoch": 0.08928, "grad_norm": 2.453125, "grad_norm_var": 0.0331207275390625, "learning_rate": 0.0001, "loss": 4.733, "loss/crossentropy": 1.8830525279045105, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21016474813222885, "step": 4464 }, { "epoch": 0.08932, "grad_norm": 2.484375, "grad_norm_var": 0.03435872395833333, "learning_rate": 0.0001, "loss": 4.7571, "loss/crossentropy": 2.3420846462249756, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.28108468651771545, "step": 4466 }, { "epoch": 0.08936, "grad_norm": 2.515625, "grad_norm_var": 0.0337310791015625, "learning_rate": 0.0001, "loss": 4.6851, "loss/crossentropy": 1.9140342473983765, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22587041556835175, "step": 4468 }, { "epoch": 0.0894, "grad_norm": 2.390625, "grad_norm_var": 0.032613118489583336, "learning_rate": 0.0001, "loss": 4.79, "loss/crossentropy": 1.9753122925758362, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24506878852844238, "step": 4470 }, { "epoch": 0.08944, "grad_norm": 2.3125, "grad_norm_var": 0.032389322916666664, "learning_rate": 0.0001, "loss": 4.7769, "loss/crossentropy": 2.0735195875167847, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2763114273548126, "step": 4472 }, { "epoch": 0.08948, "grad_norm": 2.28125, "grad_norm_var": 0.034403483072916664, "learning_rate": 0.0001, "loss": 4.3593, "loss/crossentropy": 1.7784460186958313, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22829323261976242, "step": 4474 }, { "epoch": 0.08952, "grad_norm": 2.625, "grad_norm_var": 0.051041666666666666, "learning_rate": 0.0001, "loss": 4.9451, "loss/crossentropy": 2.1188095808029175, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.27410270273685455, "step": 4476 }, { "epoch": 0.08956, "grad_norm": 2.421875, "grad_norm_var": 0.051634724934895834, "learning_rate": 0.0001, "loss": 4.3392, "loss/crossentropy": 2.320235252380371, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.27514997124671936, "step": 4478 }, { "epoch": 0.0896, "grad_norm": 2.265625, "grad_norm_var": 0.03766988118489583, "learning_rate": 0.0001, "loss": 4.8345, "loss/crossentropy": 2.3023080825805664, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2516366094350815, "step": 4480 }, { "epoch": 0.08964, "grad_norm": 2.390625, "grad_norm_var": 0.039567057291666666, "learning_rate": 0.0001, "loss": 4.6698, "loss/crossentropy": 1.9677563905715942, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2362382560968399, "step": 4482 }, { "epoch": 0.08968, "grad_norm": 2.34375, "grad_norm_var": 0.03951416015625, "learning_rate": 0.0001, "loss": 4.9159, "loss/crossentropy": 2.2005198001861572, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2760336175560951, "step": 4484 }, { "epoch": 0.08972, "grad_norm": 2.0625, "grad_norm_var": 0.0478912353515625, "learning_rate": 0.0001, "loss": 4.4418, "loss/crossentropy": 1.9799931049346924, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25132423639297485, "step": 4486 }, { "epoch": 0.08976, "grad_norm": 2.46875, "grad_norm_var": 0.10212300618489584, "learning_rate": 0.0001, "loss": 4.9643, "loss/crossentropy": 2.205371141433716, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2695635259151459, "step": 4488 }, { "epoch": 0.0898, "grad_norm": 2.421875, "grad_norm_var": 0.09692281087239583, "learning_rate": 0.0001, "loss": 4.9695, "loss/crossentropy": 2.3500062227249146, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2736963629722595, "step": 4490 }, { "epoch": 0.08984, "grad_norm": 2.59375, "grad_norm_var": 0.07867431640625, "learning_rate": 0.0001, "loss": 4.9083, "loss/crossentropy": 2.386352837085724, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2472890019416809, "step": 4492 }, { "epoch": 0.08988, "grad_norm": 2.265625, "grad_norm_var": 0.07701416015625, "learning_rate": 0.0001, "loss": 4.3598, "loss/crossentropy": 1.9863982200622559, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2288871705532074, "step": 4494 }, { "epoch": 0.08992, "grad_norm": 2.34375, "grad_norm_var": 0.07757059733072917, "learning_rate": 0.0001, "loss": 4.6157, "loss/crossentropy": 2.4088594913482666, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24304527044296265, "step": 4496 }, { "epoch": 0.08996, "grad_norm": 2.5, "grad_norm_var": 0.07517801920572917, "learning_rate": 0.0001, "loss": 4.7457, "loss/crossentropy": 2.1663140058517456, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2681227922439575, "step": 4498 }, { "epoch": 0.09, "grad_norm": 2.171875, "grad_norm_var": 0.0810455322265625, "learning_rate": 0.0001, "loss": 4.2199, "loss/crossentropy": 1.9233570098876953, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23169977217912674, "step": 4500 }, { "epoch": 0.09004, "grad_norm": 2.40625, "grad_norm_var": 0.07579752604166666, "learning_rate": 0.0001, "loss": 4.4677, "loss/crossentropy": 2.2940425872802734, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24809539318084717, "step": 4502 }, { "epoch": 0.09008, "grad_norm": 2.21875, "grad_norm_var": 0.015771484375, "learning_rate": 0.0001, "loss": 4.5333, "loss/crossentropy": 2.1874176263809204, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26286616921424866, "step": 4504 }, { "epoch": 0.09012, "grad_norm": 2.28125, "grad_norm_var": 0.015868123372395834, "learning_rate": 0.0001, "loss": 4.5029, "loss/crossentropy": 2.190543472766876, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26766398549079895, "step": 4506 }, { "epoch": 0.09016, "grad_norm": 2.296875, "grad_norm_var": 0.010823567708333334, "learning_rate": 0.0001, "loss": 4.464, "loss/crossentropy": 2.132491707801819, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2514026165008545, "step": 4508 }, { "epoch": 0.0902, "grad_norm": 2.296875, "grad_norm_var": 0.0107818603515625, "learning_rate": 0.0001, "loss": 4.5032, "loss/crossentropy": 2.1492353677749634, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25663119554519653, "step": 4510 }, { "epoch": 0.09024, "grad_norm": 2.203125, "grad_norm_var": 0.011454264322916666, "learning_rate": 0.0001, "loss": 4.6166, "loss/crossentropy": 2.2471927404403687, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24812395125627518, "step": 4512 }, { "epoch": 0.09028, "grad_norm": 2.34375, "grad_norm_var": 0.007013956705729167, "learning_rate": 0.0001, "loss": 4.5651, "loss/crossentropy": 2.1944304704666138, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26567137241363525, "step": 4514 }, { "epoch": 0.09032, "grad_norm": 2.203125, "grad_norm_var": 0.006883748372395833, "learning_rate": 0.0001, "loss": 4.3054, "loss/crossentropy": 1.7537739872932434, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21784386038780212, "step": 4516 }, { "epoch": 0.09036, "grad_norm": 2.46875, "grad_norm_var": 0.006981404622395834, "learning_rate": 0.0001, "loss": 4.6961, "loss/crossentropy": 2.133580207824707, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.27865441143512726, "step": 4518 }, { "epoch": 0.0904, "grad_norm": 2.5, "grad_norm_var": 0.008723958333333334, "learning_rate": 0.0001, "loss": 4.5486, "loss/crossentropy": 2.0858315229415894, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26390860974788666, "step": 4520 }, { "epoch": 0.09044, "grad_norm": 2.3125, "grad_norm_var": 0.008101399739583333, "learning_rate": 0.0001, "loss": 4.6833, "loss/crossentropy": 1.9084516763687134, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23013630509376526, "step": 4522 }, { "epoch": 0.09048, "grad_norm": 2.234375, "grad_norm_var": 0.00859375, "learning_rate": 0.0001, "loss": 4.5817, "loss/crossentropy": 2.2123712301254272, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2739466577768326, "step": 4524 }, { "epoch": 0.09052, "grad_norm": 2.453125, "grad_norm_var": 0.009691365559895833, "learning_rate": 0.0001, "loss": 4.9898, "loss/crossentropy": 2.3532934188842773, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25509175658226013, "step": 4526 }, { "epoch": 0.09056, "grad_norm": 2.328125, "grad_norm_var": 0.008137003580729166, "learning_rate": 0.0001, "loss": 4.7786, "loss/crossentropy": 2.1543048620224, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23639082163572311, "step": 4528 }, { "epoch": 0.0906, "grad_norm": 2.4375, "grad_norm_var": 0.0109527587890625, "learning_rate": 0.0001, "loss": 4.647, "loss/crossentropy": 2.1322286128997803, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23874164372682571, "step": 4530 }, { "epoch": 0.09064, "grad_norm": 2.25, "grad_norm_var": 0.01011962890625, "learning_rate": 0.0001, "loss": 4.7623, "loss/crossentropy": 2.5552597045898438, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24949757009744644, "step": 4532 }, { "epoch": 0.09068, "grad_norm": 2.421875, "grad_norm_var": 0.0117095947265625, "learning_rate": 0.0001, "loss": 4.6999, "loss/crossentropy": 2.1248819231987, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2795845717191696, "step": 4534 }, { "epoch": 0.09072, "grad_norm": 2.390625, "grad_norm_var": 0.014655558268229167, "learning_rate": 0.0001, "loss": 4.7739, "loss/crossentropy": 1.985984206199646, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22764238715171814, "step": 4536 }, { "epoch": 0.09076, "grad_norm": 2.328125, "grad_norm_var": 0.01519775390625, "learning_rate": 0.0001, "loss": 4.64, "loss/crossentropy": 2.220720648765564, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2693287283182144, "step": 4538 }, { "epoch": 0.0908, "grad_norm": 2.3125, "grad_norm_var": 0.01324462890625, "learning_rate": 0.0001, "loss": 4.486, "loss/crossentropy": 1.9541595578193665, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2419680804014206, "step": 4540 }, { "epoch": 0.09084, "grad_norm": 2.53125, "grad_norm_var": 0.013916015625, "learning_rate": 0.0001, "loss": 4.963, "loss/crossentropy": 2.275113582611084, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2562567666172981, "step": 4542 }, { "epoch": 0.09088, "grad_norm": 2.3125, "grad_norm_var": 0.017122395833333335, "learning_rate": 0.0001, "loss": 4.7535, "loss/crossentropy": 2.4411803483963013, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25798996537923813, "step": 4544 }, { "epoch": 0.09092, "grad_norm": 2.34375, "grad_norm_var": 0.019823201497395835, "learning_rate": 0.0001, "loss": 4.7339, "loss/crossentropy": 2.035769820213318, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25313758105039597, "step": 4546 }, { "epoch": 0.09096, "grad_norm": 2.171875, "grad_norm_var": 0.018701171875, "learning_rate": 0.0001, "loss": 4.1331, "loss/crossentropy": 1.9237529039382935, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24456002563238144, "step": 4548 }, { "epoch": 0.091, "grad_norm": 2.234375, "grad_norm_var": 0.02281494140625, "learning_rate": 0.0001, "loss": 4.6762, "loss/crossentropy": 2.179704189300537, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2681535929441452, "step": 4550 }, { "epoch": 0.09104, "grad_norm": 2.46875, "grad_norm_var": 0.024137369791666665, "learning_rate": 0.0001, "loss": 4.6899, "loss/crossentropy": 2.013023316860199, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22643990069627762, "step": 4552 }, { "epoch": 0.09108, "grad_norm": 2.453125, "grad_norm_var": 0.024201456705729166, "learning_rate": 0.0001, "loss": 4.6527, "loss/crossentropy": 2.173883557319641, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26495447754859924, "step": 4554 }, { "epoch": 0.09112, "grad_norm": 2.296875, "grad_norm_var": 0.025121053059895832, "learning_rate": 0.0001, "loss": 4.5323, "loss/crossentropy": 1.9398870468139648, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24307211488485336, "step": 4556 }, { "epoch": 0.09116, "grad_norm": 2.28125, "grad_norm_var": 0.025423177083333335, "learning_rate": 0.0001, "loss": 4.2028, "loss/crossentropy": 1.8551223874092102, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21659143269062042, "step": 4558 }, { "epoch": 0.0912, "grad_norm": 2.375, "grad_norm_var": 0.0254058837890625, "learning_rate": 0.0001, "loss": 4.7813, "loss/crossentropy": 2.104207456111908, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26515253633260727, "step": 4560 }, { "epoch": 0.09124, "grad_norm": 2.484375, "grad_norm_var": 0.023875935872395834, "learning_rate": 0.0001, "loss": 4.6241, "loss/crossentropy": 2.31631863117218, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25454505532979965, "step": 4562 }, { "epoch": 0.09128, "grad_norm": 2.265625, "grad_norm_var": 0.021800740559895834, "learning_rate": 0.0001, "loss": 4.5732, "loss/crossentropy": 2.331356406211853, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2667195200920105, "step": 4564 }, { "epoch": 0.09132, "grad_norm": 2.515625, "grad_norm_var": 0.0176910400390625, "learning_rate": 0.0001, "loss": 4.6714, "loss/crossentropy": 2.2126184701919556, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24132181704044342, "step": 4566 }, { "epoch": 0.09136, "grad_norm": 2.25, "grad_norm_var": 0.013167317708333333, "learning_rate": 0.0001, "loss": 4.5753, "loss/crossentropy": 2.330659508705139, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2838585078716278, "step": 4568 }, { "epoch": 0.0914, "grad_norm": 2.28125, "grad_norm_var": 0.0132720947265625, "learning_rate": 0.0001, "loss": 4.4935, "loss/crossentropy": 2.167214035987854, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24021611362695694, "step": 4570 }, { "epoch": 0.09144, "grad_norm": 2.15625, "grad_norm_var": 0.014046223958333333, "learning_rate": 0.0001, "loss": 4.5847, "loss/crossentropy": 1.770102322101593, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2108836993575096, "step": 4572 }, { "epoch": 0.09148, "grad_norm": 2.234375, "grad_norm_var": 0.015425618489583333, "learning_rate": 0.0001, "loss": 4.2906, "loss/crossentropy": 1.9293717741966248, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2186967208981514, "step": 4574 }, { "epoch": 0.09152, "grad_norm": 2.484375, "grad_norm_var": 0.014188639322916667, "learning_rate": 0.0001, "loss": 4.8128, "loss/crossentropy": 2.4099135398864746, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25587645173072815, "step": 4576 }, { "epoch": 0.09156, "grad_norm": 2.359375, "grad_norm_var": 0.06220296223958333, "learning_rate": 0.0001, "loss": 4.5727, "loss/crossentropy": 1.7967870831489563, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2521464377641678, "step": 4578 }, { "epoch": 0.0916, "grad_norm": 2.46875, "grad_norm_var": 0.0616851806640625, "learning_rate": 0.0001, "loss": 4.7604, "loss/crossentropy": 1.989583432674408, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22892683744430542, "step": 4580 }, { "epoch": 0.09164, "grad_norm": 2.625, "grad_norm_var": 0.0626953125, "learning_rate": 0.0001, "loss": 4.4986, "loss/crossentropy": 2.167198657989502, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24552703648805618, "step": 4582 }, { "epoch": 0.09168, "grad_norm": 2.453125, "grad_norm_var": 0.06129150390625, "learning_rate": 0.0001, "loss": 4.8237, "loss/crossentropy": 2.213137984275818, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2614079788327217, "step": 4584 }, { "epoch": 0.09172, "grad_norm": 2.265625, "grad_norm_var": 0.06194661458333333, "learning_rate": 0.0001, "loss": 4.977, "loss/crossentropy": 2.3586392998695374, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2527218610048294, "step": 4586 }, { "epoch": 0.09176, "grad_norm": 2.53125, "grad_norm_var": 0.06054280598958333, "learning_rate": 0.0001, "loss": 4.5118, "loss/crossentropy": 2.326598286628723, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2684827446937561, "step": 4588 }, { "epoch": 0.0918, "grad_norm": 2.234375, "grad_norm_var": 0.055562337239583336, "learning_rate": 0.0001, "loss": 4.4296, "loss/crossentropy": 2.1365907192230225, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2705482095479965, "step": 4590 }, { "epoch": 0.09184, "grad_norm": 2.328125, "grad_norm_var": 0.056005859375, "learning_rate": 0.0001, "loss": 4.6895, "loss/crossentropy": 1.816661775112152, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2188858687877655, "step": 4592 }, { "epoch": 0.09188, "grad_norm": 2.203125, "grad_norm_var": 0.020731608072916668, "learning_rate": 0.0001, "loss": 4.5953, "loss/crossentropy": 2.1533923149108887, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2372177392244339, "step": 4594 }, { "epoch": 0.09192, "grad_norm": 2.984375, "grad_norm_var": 0.04504801432291667, "learning_rate": 0.0001, "loss": 4.9472, "loss/crossentropy": 2.2175614833831787, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.27555912733078003, "step": 4596 }, { "epoch": 0.09196, "grad_norm": 2.640625, "grad_norm_var": 0.04644775390625, "learning_rate": 0.0001, "loss": 5.11, "loss/crossentropy": 2.583792209625244, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2970864772796631, "step": 4598 }, { "epoch": 0.092, "grad_norm": 2.40625, "grad_norm_var": 0.046751912434895834, "learning_rate": 0.0001, "loss": 4.5761, "loss/crossentropy": 2.227868676185608, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.259210504591465, "step": 4600 }, { "epoch": 0.09204, "grad_norm": 2.265625, "grad_norm_var": 0.047118123372395834, "learning_rate": 0.0001, "loss": 4.4125, "loss/crossentropy": 2.146941900253296, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26443855464458466, "step": 4602 }, { "epoch": 0.09208, "grad_norm": 2.296875, "grad_norm_var": 0.04439697265625, "learning_rate": 0.0001, "loss": 4.8655, "loss/crossentropy": 2.129795551300049, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2451881766319275, "step": 4604 }, { "epoch": 0.09212, "grad_norm": 2.8125, "grad_norm_var": 0.0506256103515625, "learning_rate": 0.0001, "loss": 4.895, "loss/crossentropy": 2.2696213722229004, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26037923991680145, "step": 4606 }, { "epoch": 0.09216, "grad_norm": 3.03125, "grad_norm_var": 0.07105712890625, "learning_rate": 0.0001, "loss": 4.7424, "loss/crossentropy": 2.1916056871414185, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2648691013455391, "step": 4608 }, { "epoch": 0.0922, "grad_norm": 2.15625, "grad_norm_var": 0.07119038899739584, "learning_rate": 0.0001, "loss": 4.5021, "loss/crossentropy": 1.975761890411377, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22248996049165726, "step": 4610 }, { "epoch": 0.09224, "grad_norm": 2.265625, "grad_norm_var": 0.053511555989583334, "learning_rate": 0.0001, "loss": 4.6596, "loss/crossentropy": 2.403375506401062, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24553030729293823, "step": 4612 }, { "epoch": 0.09228, "grad_norm": 2.390625, "grad_norm_var": 0.049637858072916666, "learning_rate": 0.0001, "loss": 4.4542, "loss/crossentropy": 1.9468475580215454, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24023501574993134, "step": 4614 }, { "epoch": 0.09232, "grad_norm": 2.40625, "grad_norm_var": 0.0534088134765625, "learning_rate": 0.0001, "loss": 4.8095, "loss/crossentropy": 2.013557195663452, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23829826712608337, "step": 4616 }, { "epoch": 0.09236, "grad_norm": 2.265625, "grad_norm_var": 0.0519439697265625, "learning_rate": 0.0001, "loss": 4.5736, "loss/crossentropy": 2.0552549958229065, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2391991689801216, "step": 4618 }, { "epoch": 0.0924, "grad_norm": 2.28125, "grad_norm_var": 0.053694661458333334, "learning_rate": 0.0001, "loss": 4.4091, "loss/crossentropy": 1.8311110734939575, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23821169883012772, "step": 4620 }, { "epoch": 0.09244, "grad_norm": 2.28125, "grad_norm_var": 0.04431966145833333, "learning_rate": 0.0001, "loss": 4.5034, "loss/crossentropy": 2.0346454977989197, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23688799142837524, "step": 4622 }, { "epoch": 0.09248, "grad_norm": 2.484375, "grad_norm_var": 0.016304524739583333, "learning_rate": 0.0001, "loss": 4.5329, "loss/crossentropy": 1.8475716710090637, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23444947600364685, "step": 4624 }, { "epoch": 0.09252, "grad_norm": 2.328125, "grad_norm_var": 0.0136383056640625, "learning_rate": 0.0001, "loss": 4.4315, "loss/crossentropy": 2.075626790523529, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2663211151957512, "step": 4626 }, { "epoch": 0.09256, "grad_norm": 2.46875, "grad_norm_var": 0.0130035400390625, "learning_rate": 0.0001, "loss": 4.9214, "loss/crossentropy": 2.413028359413147, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.29516373574733734, "step": 4628 }, { "epoch": 0.0926, "grad_norm": 2.53125, "grad_norm_var": 0.014143880208333333, "learning_rate": 0.0001, "loss": 4.6878, "loss/crossentropy": 1.8549358248710632, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23275860399007797, "step": 4630 }, { "epoch": 0.09264, "grad_norm": 2.390625, "grad_norm_var": 0.0076405843098958336, "learning_rate": 0.0001, "loss": 4.8594, "loss/crossentropy": 2.273250460624695, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2511187419295311, "step": 4632 }, { "epoch": 0.09268, "grad_norm": 2.4375, "grad_norm_var": 0.013093058268229167, "learning_rate": 0.0001, "loss": 4.336, "loss/crossentropy": 1.929758369922638, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25462278723716736, "step": 4634 }, { "epoch": 0.09272, "grad_norm": 2.296875, "grad_norm_var": 0.013158162434895834, "learning_rate": 0.0001, "loss": 4.5888, "loss/crossentropy": 2.0929598212242126, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23592843115329742, "step": 4636 }, { "epoch": 0.09276, "grad_norm": 2.21875, "grad_norm_var": 0.015282185872395833, "learning_rate": 0.0001, "loss": 4.4595, "loss/crossentropy": 1.973824143409729, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24049720913171768, "step": 4638 }, { "epoch": 0.0928, "grad_norm": 2.25, "grad_norm_var": 0.014207967122395833, "learning_rate": 0.0001, "loss": 4.2606, "loss/crossentropy": 1.8606626987457275, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21698658913373947, "step": 4640 }, { "epoch": 0.09284, "grad_norm": 2.5625, "grad_norm_var": 0.01875, "learning_rate": 0.0001, "loss": 4.9827, "loss/crossentropy": 2.241386890411377, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2831972986459732, "step": 4642 }, { "epoch": 0.09288, "grad_norm": 2.34375, "grad_norm_var": 0.0166656494140625, "learning_rate": 0.0001, "loss": 4.7046, "loss/crossentropy": 2.2538920640945435, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2482161968946457, "step": 4644 }, { "epoch": 0.09292, "grad_norm": 2.578125, "grad_norm_var": 0.01812744140625, "learning_rate": 0.0001, "loss": 4.8557, "loss/crossentropy": 2.067206382751465, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26665763556957245, "step": 4646 }, { "epoch": 0.09296, "grad_norm": 2.5625, "grad_norm_var": 0.021751912434895833, "learning_rate": 0.0001, "loss": 4.9474, "loss/crossentropy": 2.33401358127594, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2752760946750641, "step": 4648 }, { "epoch": 0.093, "grad_norm": 2.4375, "grad_norm_var": 0.04914449055989583, "learning_rate": 0.0001, "loss": 4.8658, "loss/crossentropy": 2.1510735750198364, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24588150531053543, "step": 4650 }, { "epoch": 0.09304, "grad_norm": 2.28125, "grad_norm_var": 0.049397786458333336, "learning_rate": 0.0001, "loss": 4.579, "loss/crossentropy": 2.086448848247528, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.261296346783638, "step": 4652 }, { "epoch": 0.09308, "grad_norm": 2.5625, "grad_norm_var": 0.0446685791015625, "learning_rate": 0.0001, "loss": 4.7495, "loss/crossentropy": 1.5951193571090698, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2312234491109848, "step": 4654 }, { "epoch": 0.09312, "grad_norm": 2.328125, "grad_norm_var": 0.04052734375, "learning_rate": 0.0001, "loss": 4.2273, "loss/crossentropy": 2.0098360776901245, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22354383766651154, "step": 4656 }, { "epoch": 0.09316, "grad_norm": 2.375, "grad_norm_var": 0.04079488118489583, "learning_rate": 0.0001, "loss": 4.6047, "loss/crossentropy": 2.263219714164734, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26900506019592285, "step": 4658 }, { "epoch": 0.0932, "grad_norm": 2.828125, "grad_norm_var": 0.0485504150390625, "learning_rate": 0.0001, "loss": 4.5575, "loss/crossentropy": 1.9481555819511414, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24665354192256927, "step": 4660 }, { "epoch": 0.09324, "grad_norm": 2.265625, "grad_norm_var": 0.04988505045572917, "learning_rate": 0.0001, "loss": 4.5895, "loss/crossentropy": 1.8751367926597595, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24356309324502945, "step": 4662 }, { "epoch": 0.09328, "grad_norm": 2.28125, "grad_norm_var": 0.06110026041666667, "learning_rate": 0.0001, "loss": 4.7739, "loss/crossentropy": 2.343896746635437, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2558425962924957, "step": 4664 }, { "epoch": 0.09332, "grad_norm": 2.40625, "grad_norm_var": 0.032811482747395836, "learning_rate": 0.0001, "loss": 4.779, "loss/crossentropy": 2.067797303199768, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25030098110437393, "step": 4666 }, { "epoch": 0.09336, "grad_norm": 2.359375, "grad_norm_var": 0.029816691080729166, "learning_rate": 0.0001, "loss": 4.7427, "loss/crossentropy": 2.4221161603927612, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26934675872325897, "step": 4668 }, { "epoch": 0.0934, "grad_norm": 2.8125, "grad_norm_var": 0.038182576497395836, "learning_rate": 0.0001, "loss": 4.845, "loss/crossentropy": 2.2733672857284546, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2536270022392273, "step": 4670 }, { "epoch": 0.09344, "grad_norm": 2.390625, "grad_norm_var": 0.0369537353515625, "learning_rate": 0.0001, "loss": 4.7721, "loss/crossentropy": 2.179157257080078, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24323320388793945, "step": 4672 }, { "epoch": 0.09348, "grad_norm": 2.578125, "grad_norm_var": 0.03875325520833333, "learning_rate": 0.0001, "loss": 4.8574, "loss/crossentropy": 2.3613970279693604, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2567686140537262, "step": 4674 }, { "epoch": 0.09352, "grad_norm": 2.5, "grad_norm_var": 0.029683430989583332, "learning_rate": 0.0001, "loss": 4.787, "loss/crossentropy": 2.0755810141563416, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2349473536014557, "step": 4676 }, { "epoch": 0.09356, "grad_norm": 2.25, "grad_norm_var": 0.03134358723958333, "learning_rate": 0.0001, "loss": 4.6235, "loss/crossentropy": 1.9971619248390198, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22000454366207123, "step": 4678 }, { "epoch": 0.0936, "grad_norm": 2.3125, "grad_norm_var": 0.020068359375, "learning_rate": 0.0001, "loss": 4.5307, "loss/crossentropy": 1.9419977068901062, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21052303910255432, "step": 4680 }, { "epoch": 0.09364, "grad_norm": 2.3125, "grad_norm_var": 0.021122233072916666, "learning_rate": 0.0001, "loss": 4.6314, "loss/crossentropy": 2.1175334453582764, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23567666858434677, "step": 4682 }, { "epoch": 0.09368, "grad_norm": 2.40625, "grad_norm_var": 0.021122233072916666, "learning_rate": 0.0001, "loss": 4.5955, "loss/crossentropy": 2.0300605297088623, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.29177258908748627, "step": 4684 }, { "epoch": 0.09372, "grad_norm": 2.296875, "grad_norm_var": 0.009228515625, "learning_rate": 0.0001, "loss": 4.5583, "loss/crossentropy": 2.052473723888397, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22480525821447372, "step": 4686 }, { "epoch": 0.09376, "grad_norm": 2.359375, "grad_norm_var": 0.008275349934895834, "learning_rate": 0.0001, "loss": 4.4925, "loss/crossentropy": 2.2506834268569946, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26876674592494965, "step": 4688 }, { "epoch": 0.0938, "grad_norm": 2.125, "grad_norm_var": 0.007515462239583334, "learning_rate": 0.0001, "loss": 4.291, "loss/crossentropy": 1.930393099784851, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22246946394443512, "step": 4690 }, { "epoch": 0.09384, "grad_norm": 2.5, "grad_norm_var": 0.007616170247395833, "learning_rate": 0.0001, "loss": 4.9317, "loss/crossentropy": 2.5477651357650757, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.27808643877506256, "step": 4692 }, { "epoch": 0.09388, "grad_norm": 2.15625, "grad_norm_var": 0.009033203125, "learning_rate": 0.0001, "loss": 4.5313, "loss/crossentropy": 2.1059221625328064, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.253792941570282, "step": 4694 }, { "epoch": 0.09392, "grad_norm": 2.265625, "grad_norm_var": 0.0091217041015625, "learning_rate": 0.0001, "loss": 4.607, "loss/crossentropy": 2.0058358907699585, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23942459374666214, "step": 4696 }, { "epoch": 0.09396, "grad_norm": 2.25, "grad_norm_var": 0.0103515625, "learning_rate": 0.0001, "loss": 4.2024, "loss/crossentropy": 1.9118947982788086, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2300974577665329, "step": 4698 }, { "epoch": 0.094, "grad_norm": 2.5625, "grad_norm_var": 0.013337198893229167, "learning_rate": 0.0001, "loss": 4.6541, "loss/crossentropy": 2.1172733902931213, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24785596132278442, "step": 4700 }, { "epoch": 0.09404, "grad_norm": 2.5, "grad_norm_var": 0.016044108072916667, "learning_rate": 0.0001, "loss": 4.9005, "loss/crossentropy": 2.0064170956611633, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25045372545719147, "step": 4702 }, { "epoch": 0.09408, "grad_norm": 2.3125, "grad_norm_var": 0.015738932291666667, "learning_rate": 0.0001, "loss": 4.8139, "loss/crossentropy": 2.5816656351089478, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.27938568592071533, "step": 4704 }, { "epoch": 0.09412, "grad_norm": 2.25, "grad_norm_var": 0.016597493489583334, "learning_rate": 0.0001, "loss": 4.866, "loss/crossentropy": 2.5768171548843384, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.27975398302078247, "step": 4706 }, { "epoch": 0.09416, "grad_norm": 2.234375, "grad_norm_var": 0.0153717041015625, "learning_rate": 0.0001, "loss": 4.6075, "loss/crossentropy": 2.323893189430237, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2608431279659271, "step": 4708 }, { "epoch": 0.0942, "grad_norm": 2.265625, "grad_norm_var": 0.01357421875, "learning_rate": 0.0001, "loss": 4.3564, "loss/crossentropy": 1.7910810708999634, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21380099654197693, "step": 4710 }, { "epoch": 0.09424, "grad_norm": 2.21875, "grad_norm_var": 0.015848795572916668, "learning_rate": 0.0001, "loss": 4.6374, "loss/crossentropy": 2.214052677154541, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24833911657333374, "step": 4712 }, { "epoch": 0.09428, "grad_norm": 2.40625, "grad_norm_var": 0.0158355712890625, "learning_rate": 0.0001, "loss": 4.9987, "loss/crossentropy": 2.0850380063056946, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2419341504573822, "step": 4714 }, { "epoch": 0.09432, "grad_norm": 2.359375, "grad_norm_var": 0.014058430989583334, "learning_rate": 0.0001, "loss": 4.2961, "loss/crossentropy": 2.0707927346229553, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2499033510684967, "step": 4716 }, { "epoch": 0.09436, "grad_norm": 2.3125, "grad_norm_var": 0.0125396728515625, "learning_rate": 0.0001, "loss": 4.4057, "loss/crossentropy": 2.11221444606781, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25916673243045807, "step": 4718 }, { "epoch": 0.0944, "grad_norm": 2.390625, "grad_norm_var": 0.012718709309895833, "learning_rate": 0.0001, "loss": 4.6057, "loss/crossentropy": 1.8400230407714844, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22771906107664108, "step": 4720 }, { "epoch": 0.09444, "grad_norm": 2.28125, "grad_norm_var": 0.008234659830729166, "learning_rate": 0.0001, "loss": 4.4929, "loss/crossentropy": 2.2271865606307983, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2574647441506386, "step": 4722 }, { "epoch": 0.09448, "grad_norm": 2.203125, "grad_norm_var": 0.008649698893229167, "learning_rate": 0.0001, "loss": 4.4629, "loss/crossentropy": 2.4532920122146606, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.27140843868255615, "step": 4724 }, { "epoch": 0.09452, "grad_norm": 2.375, "grad_norm_var": 0.009504191080729167, "learning_rate": 0.0001, "loss": 4.731, "loss/crossentropy": 1.951303780078888, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2333502620458603, "step": 4726 }, { "epoch": 0.09456, "grad_norm": 2.40625, "grad_norm_var": 0.009137980143229167, "learning_rate": 0.0001, "loss": 4.6334, "loss/crossentropy": 2.209821343421936, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24776015430688858, "step": 4728 }, { "epoch": 0.0946, "grad_norm": 2.3125, "grad_norm_var": 0.010872395833333333, "learning_rate": 0.0001, "loss": 4.861, "loss/crossentropy": 2.434941530227661, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2747645229101181, "step": 4730 }, { "epoch": 0.09464, "grad_norm": 2.1875, "grad_norm_var": 0.010602823893229167, "learning_rate": 0.0001, "loss": 4.6145, "loss/crossentropy": 2.051652252674103, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24515582621097565, "step": 4732 }, { "epoch": 0.09468, "grad_norm": 2.265625, "grad_norm_var": 0.009797159830729167, "learning_rate": 0.0001, "loss": 4.6804, "loss/crossentropy": 2.0039377212524414, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23073314130306244, "step": 4734 }, { "epoch": 0.09472, "grad_norm": 2.4375, "grad_norm_var": 0.19903971354166666, "learning_rate": 0.0001, "loss": 4.8333, "loss/crossentropy": 2.166410982608795, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25871724635362625, "step": 4736 }, { "epoch": 0.09476, "grad_norm": 2.3125, "grad_norm_var": 0.2066802978515625, "learning_rate": 0.0001, "loss": 4.2739, "loss/crossentropy": 1.9289153218269348, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23183659464120865, "step": 4738 }, { "epoch": 0.0948, "grad_norm": 2.46875, "grad_norm_var": 0.20640360514322917, "learning_rate": 0.0001, "loss": 4.5903, "loss/crossentropy": 2.270771861076355, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26335832476615906, "step": 4740 }, { "epoch": 0.09484, "grad_norm": 2.4375, "grad_norm_var": 0.206494140625, "learning_rate": 0.0001, "loss": 4.7142, "loss/crossentropy": 2.395651936531067, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.28527122735977173, "step": 4742 }, { "epoch": 0.09488, "grad_norm": 2.390625, "grad_norm_var": 0.20437723795572918, "learning_rate": 0.0001, "loss": 4.5083, "loss/crossentropy": 1.8597867488861084, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.224149189889431, "step": 4744 }, { "epoch": 0.09492, "grad_norm": 2.265625, "grad_norm_var": 0.20693257649739583, "learning_rate": 0.0001, "loss": 4.4415, "loss/crossentropy": 1.7795116305351257, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23233074694871902, "step": 4746 }, { "epoch": 0.09496, "grad_norm": 2.4375, "grad_norm_var": 0.20255533854166666, "learning_rate": 0.0001, "loss": 4.7749, "loss/crossentropy": 1.9449282884597778, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2524118423461914, "step": 4748 }, { "epoch": 0.095, "grad_norm": 2.53125, "grad_norm_var": 0.2005035400390625, "learning_rate": 0.0001, "loss": 4.7047, "loss/crossentropy": 2.195169448852539, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2349853590130806, "step": 4750 }, { "epoch": 0.09504, "grad_norm": 2.671875, "grad_norm_var": 0.022468058268229167, "learning_rate": 0.0001, "loss": 4.5268, "loss/crossentropy": 1.6628928184509277, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1975274682044983, "step": 4752 }, { "epoch": 0.09508, "grad_norm": 2.25, "grad_norm_var": 0.018488566080729168, "learning_rate": 0.0001, "loss": 4.5862, "loss/crossentropy": 2.0991236567497253, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23341374844312668, "step": 4754 }, { "epoch": 0.09512, "grad_norm": 2.40625, "grad_norm_var": 0.017024739583333334, "learning_rate": 0.0001, "loss": 4.8338, "loss/crossentropy": 2.350286066532135, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2648291736841202, "step": 4756 }, { "epoch": 0.09516, "grad_norm": 2.375, "grad_norm_var": 0.017430623372395832, "learning_rate": 0.0001, "loss": 4.5788, "loss/crossentropy": 2.02384877204895, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24653150886297226, "step": 4758 }, { "epoch": 0.0952, "grad_norm": 2.203125, "grad_norm_var": 0.01734619140625, "learning_rate": 0.0001, "loss": 4.7052, "loss/crossentropy": 2.264349341392517, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25881223380565643, "step": 4760 }, { "epoch": 0.09524, "grad_norm": 2.890625, "grad_norm_var": 0.03779296875, "learning_rate": 0.0001, "loss": 4.8216, "loss/crossentropy": 2.197356939315796, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2565019279718399, "step": 4762 }, { "epoch": 0.09528, "grad_norm": 2.515625, "grad_norm_var": 0.038655598958333336, "learning_rate": 0.0001, "loss": 4.3849, "loss/crossentropy": 1.8794787526130676, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21771979331970215, "step": 4764 }, { "epoch": 0.09532, "grad_norm": 2.3125, "grad_norm_var": 0.03795572916666667, "learning_rate": 0.0001, "loss": 4.7391, "loss/crossentropy": 2.107520580291748, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2350333333015442, "step": 4766 }, { "epoch": 0.09536, "grad_norm": 2.3125, "grad_norm_var": 0.03297119140625, "learning_rate": 0.0001, "loss": 4.5194, "loss/crossentropy": 2.082249402999878, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24860269576311111, "step": 4768 }, { "epoch": 0.0954, "grad_norm": 2.296875, "grad_norm_var": 0.03062744140625, "learning_rate": 0.0001, "loss": 4.3985, "loss/crossentropy": 1.9632289409637451, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24099770188331604, "step": 4770 }, { "epoch": 0.09544, "grad_norm": 2.359375, "grad_norm_var": 0.02994384765625, "learning_rate": 0.0001, "loss": 4.4526, "loss/crossentropy": 2.3403327465057373, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2573448717594147, "step": 4772 }, { "epoch": 0.09548, "grad_norm": 2.40625, "grad_norm_var": 0.03181050618489583, "learning_rate": 0.0001, "loss": 4.7711, "loss/crossentropy": 2.302455425262451, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.28050975501537323, "step": 4774 }, { "epoch": 0.09552, "grad_norm": 2.28125, "grad_norm_var": 0.031103515625, "learning_rate": 0.0001, "loss": 4.6092, "loss/crossentropy": 2.2875213623046875, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23910009860992432, "step": 4776 }, { "epoch": 0.09556, "grad_norm": 2.765625, "grad_norm_var": 0.022721354166666666, "learning_rate": 0.0001, "loss": 4.5237, "loss/crossentropy": 2.0440263748168945, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24640783667564392, "step": 4778 }, { "epoch": 0.0956, "grad_norm": 2.53125, "grad_norm_var": 0.02613525390625, "learning_rate": 0.0001, "loss": 4.6328, "loss/crossentropy": 2.1597142219543457, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24566050618886948, "step": 4780 }, { "epoch": 0.09564, "grad_norm": 2.203125, "grad_norm_var": 0.027765909830729168, "learning_rate": 0.0001, "loss": 4.7138, "loss/crossentropy": 2.2846235036849976, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2598261833190918, "step": 4782 }, { "epoch": 0.09568, "grad_norm": 2.1875, "grad_norm_var": 0.029683430989583332, "learning_rate": 0.0001, "loss": 4.4527, "loss/crossentropy": 2.032243251800537, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2488306611776352, "step": 4784 }, { "epoch": 0.09572, "grad_norm": 2.5, "grad_norm_var": 0.037230428059895834, "learning_rate": 0.0001, "loss": 4.7651, "loss/crossentropy": 2.4999172687530518, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25949685275554657, "step": 4786 }, { "epoch": 0.09576, "grad_norm": 2.296875, "grad_norm_var": 0.0375152587890625, "learning_rate": 0.0001, "loss": 4.6068, "loss/crossentropy": 2.1610575914382935, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2414923906326294, "step": 4788 }, { "epoch": 0.0958, "grad_norm": 2.390625, "grad_norm_var": 0.03135477701822917, "learning_rate": 0.0001, "loss": 4.5309, "loss/crossentropy": 1.8679919838905334, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21853071451187134, "step": 4790 }, { "epoch": 0.09584, "grad_norm": 2.390625, "grad_norm_var": 0.031281534830729166, "learning_rate": 0.0001, "loss": 4.697, "loss/crossentropy": 2.072615623474121, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24943216145038605, "step": 4792 }, { "epoch": 0.09588, "grad_norm": 2.765625, "grad_norm_var": 0.032624308268229166, "learning_rate": 0.0001, "loss": 4.5555, "loss/crossentropy": 2.420115113258362, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2840816229581833, "step": 4794 }, { "epoch": 0.09592, "grad_norm": 2.6875, "grad_norm_var": 0.0455078125, "learning_rate": 0.0001, "loss": 5.102, "loss/crossentropy": 2.2809360027313232, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2739071249961853, "step": 4796 }, { "epoch": 0.09596, "grad_norm": 2.328125, "grad_norm_var": 0.04319559733072917, "learning_rate": 0.0001, "loss": 4.5418, "loss/crossentropy": 2.1809465289115906, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24680403620004654, "step": 4798 }, { "epoch": 0.096, "grad_norm": 2.15625, "grad_norm_var": 0.044611612955729164, "learning_rate": 0.0001, "loss": 4.6994, "loss/crossentropy": 1.9710316061973572, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2598233222961426, "step": 4800 }, { "epoch": 0.09604, "grad_norm": 2.40625, "grad_norm_var": 0.035477701822916666, "learning_rate": 0.0001, "loss": 4.9811, "loss/crossentropy": 2.4378572702407837, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2665309011936188, "step": 4802 }, { "epoch": 0.09608, "grad_norm": 2.390625, "grad_norm_var": 0.03759358723958333, "learning_rate": 0.0001, "loss": 4.604, "loss/crossentropy": 1.8578800559043884, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24051420390605927, "step": 4804 }, { "epoch": 0.09612, "grad_norm": 2.40625, "grad_norm_var": 0.034520467122395836, "learning_rate": 0.0001, "loss": 4.4567, "loss/crossentropy": 2.191601037979126, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25017624348402023, "step": 4806 }, { "epoch": 0.09616, "grad_norm": 2.296875, "grad_norm_var": 0.038374837239583334, "learning_rate": 0.0001, "loss": 4.5777, "loss/crossentropy": 2.2077550888061523, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.31426818668842316, "step": 4808 }, { "epoch": 0.0962, "grad_norm": 2.390625, "grad_norm_var": 0.030659993489583332, "learning_rate": 0.0001, "loss": 4.917, "loss/crossentropy": 2.2983503341674805, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26998060941696167, "step": 4810 }, { "epoch": 0.09624, "grad_norm": 2.703125, "grad_norm_var": 0.020824178059895834, "learning_rate": 0.0001, "loss": 4.8691, "loss/crossentropy": 2.0875505208969116, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2673826217651367, "step": 4812 }, { "epoch": 0.09628, "grad_norm": 2.03125, "grad_norm_var": 0.031403605143229166, "learning_rate": 0.0001, "loss": 4.0736, "loss/crossentropy": 1.9439318776130676, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2379719465970993, "step": 4814 }, { "epoch": 0.09632, "grad_norm": 2.359375, "grad_norm_var": 0.03611551920572917, "learning_rate": 0.0001, "loss": 4.4698, "loss/crossentropy": 1.9822518229484558, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24425340443849564, "step": 4816 }, { "epoch": 0.09636, "grad_norm": 2.296875, "grad_norm_var": 0.03902994791666667, "learning_rate": 0.0001, "loss": 4.475, "loss/crossentropy": 2.0238336324691772, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2409110590815544, "step": 4818 }, { "epoch": 0.0964, "grad_norm": 2.34375, "grad_norm_var": 0.03707275390625, "learning_rate": 0.0001, "loss": 4.5401, "loss/crossentropy": 2.454450249671936, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26127950847148895, "step": 4820 }, { "epoch": 0.09644, "grad_norm": 2.296875, "grad_norm_var": 0.038407389322916666, "learning_rate": 0.0001, "loss": 4.5682, "loss/crossentropy": 2.210579752922058, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23662539571523666, "step": 4822 }, { "epoch": 0.09648, "grad_norm": 2.28125, "grad_norm_var": 0.034601847330729164, "learning_rate": 0.0001, "loss": 4.5527, "loss/crossentropy": 1.8737664222717285, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22884630411863327, "step": 4824 }, { "epoch": 0.09652, "grad_norm": 2.484375, "grad_norm_var": 0.0335357666015625, "learning_rate": 0.0001, "loss": 4.6221, "loss/crossentropy": 2.1939562559127808, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25145241618156433, "step": 4826 }, { "epoch": 0.09656, "grad_norm": 2.328125, "grad_norm_var": 0.0278472900390625, "learning_rate": 0.0001, "loss": 4.6685, "loss/crossentropy": 2.133625030517578, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25248992443084717, "step": 4828 }, { "epoch": 0.0966, "grad_norm": 2.328125, "grad_norm_var": 0.0197906494140625, "learning_rate": 0.0001, "loss": 4.2854, "loss/crossentropy": 2.259738326072693, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24727293848991394, "step": 4830 }, { "epoch": 0.09664, "grad_norm": 2.984375, "grad_norm_var": 0.043635050455729164, "learning_rate": 0.0001, "loss": 4.7497, "loss/crossentropy": 2.0186268091201782, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.27940231561660767, "step": 4832 }, { "epoch": 0.09668, "grad_norm": 2.234375, "grad_norm_var": 0.04185282389322917, "learning_rate": 0.0001, "loss": 4.5867, "loss/crossentropy": 1.9686395525932312, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23021705448627472, "step": 4834 }, { "epoch": 0.09672, "grad_norm": 2.296875, "grad_norm_var": 0.0424224853515625, "learning_rate": 0.0001, "loss": 4.6157, "loss/crossentropy": 2.1677820682525635, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2766892910003662, "step": 4836 }, { "epoch": 0.09676, "grad_norm": 2.1875, "grad_norm_var": 0.04670308430989583, "learning_rate": 0.0001, "loss": 4.3751, "loss/crossentropy": 1.8458788990974426, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23294126987457275, "step": 4838 }, { "epoch": 0.0968, "grad_norm": 2.640625, "grad_norm_var": 0.0495269775390625, "learning_rate": 0.0001, "loss": 4.8931, "loss/crossentropy": 1.7898097038269043, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23436476290225983, "step": 4840 }, { "epoch": 0.09684, "grad_norm": 2.375, "grad_norm_var": 0.04843343098958333, "learning_rate": 0.0001, "loss": 4.5169, "loss/crossentropy": 2.4594497680664062, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2618473023176193, "step": 4842 }, { "epoch": 0.09688, "grad_norm": 2.4375, "grad_norm_var": 0.04752197265625, "learning_rate": 0.0001, "loss": 4.4841, "loss/crossentropy": 2.061558425426483, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2523345798254013, "step": 4844 }, { "epoch": 0.09692, "grad_norm": 2.484375, "grad_norm_var": 0.04462890625, "learning_rate": 0.0001, "loss": 4.5606, "loss/crossentropy": 2.060658574104309, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2493506520986557, "step": 4846 }, { "epoch": 0.09696, "grad_norm": 2.25, "grad_norm_var": 0.016169230143229168, "learning_rate": 0.0001, "loss": 4.8028, "loss/crossentropy": 2.201029062271118, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2629493921995163, "step": 4848 }, { "epoch": 0.097, "grad_norm": 2.34375, "grad_norm_var": 0.01519775390625, "learning_rate": 0.0001, "loss": 4.7404, "loss/crossentropy": 2.199273705482483, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.27054519951343536, "step": 4850 }, { "epoch": 0.09704, "grad_norm": 2.109375, "grad_norm_var": 0.0189453125, "learning_rate": 0.0001, "loss": 4.3008, "loss/crossentropy": 2.2466784715652466, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2678111642599106, "step": 4852 }, { "epoch": 0.09708, "grad_norm": 2.71875, "grad_norm_var": 0.022054036458333332, "learning_rate": 0.0001, "loss": 4.6762, "loss/crossentropy": 1.9152815341949463, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2441672906279564, "step": 4854 }, { "epoch": 0.09712, "grad_norm": 2.21875, "grad_norm_var": 0.017577107747395834, "learning_rate": 0.0001, "loss": 4.3818, "loss/crossentropy": 2.092438578605652, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24061349034309387, "step": 4856 }, { "epoch": 0.09716, "grad_norm": 2.4375, "grad_norm_var": 0.01802978515625, "learning_rate": 0.0001, "loss": 4.3224, "loss/crossentropy": 1.8620384335517883, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23738765716552734, "step": 4858 }, { "epoch": 0.0972, "grad_norm": 2.953125, "grad_norm_var": 0.04138997395833333, "learning_rate": 0.0001, "loss": 4.8798, "loss/crossentropy": 2.199701428413391, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26521213352680206, "step": 4860 }, { "epoch": 0.09724, "grad_norm": 2.53125, "grad_norm_var": 0.042170206705729164, "learning_rate": 0.0001, "loss": 4.7598, "loss/crossentropy": 2.3637821674346924, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2707534506917, "step": 4862 }, { "epoch": 0.09728, "grad_norm": 2.21875, "grad_norm_var": 0.045491536458333336, "learning_rate": 0.0001, "loss": 4.5184, "loss/crossentropy": 2.207235813140869, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.28080131858587265, "step": 4864 }, { "epoch": 0.09732, "grad_norm": 2.265625, "grad_norm_var": 0.0466461181640625, "learning_rate": 0.0001, "loss": 4.3798, "loss/crossentropy": 1.901595950126648, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23459318280220032, "step": 4866 }, { "epoch": 0.09736, "grad_norm": 2.3125, "grad_norm_var": 0.042769368489583334, "learning_rate": 0.0001, "loss": 4.4289, "loss/crossentropy": 2.298312544822693, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2580094337463379, "step": 4868 }, { "epoch": 0.0974, "grad_norm": 2.3125, "grad_norm_var": 0.03658854166666667, "learning_rate": 0.0001, "loss": 4.7576, "loss/crossentropy": 2.29680597782135, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24749226868152618, "step": 4870 }, { "epoch": 0.09744, "grad_norm": 2.25, "grad_norm_var": 0.0361968994140625, "learning_rate": 0.0001, "loss": 4.5839, "loss/crossentropy": 2.1169378757476807, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2565944790840149, "step": 4872 }, { "epoch": 0.09748, "grad_norm": 2.140625, "grad_norm_var": 0.04035542805989583, "learning_rate": 0.0001, "loss": 4.5364, "loss/crossentropy": 2.0863184928894043, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2527791038155556, "step": 4874 }, { "epoch": 0.09752, "grad_norm": 2.296875, "grad_norm_var": 0.015625, "learning_rate": 0.0001, "loss": 4.6176, "loss/crossentropy": 2.146193563938141, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23968011140823364, "step": 4876 }, { "epoch": 0.09756, "grad_norm": 2.375, "grad_norm_var": 0.01060791015625, "learning_rate": 0.0001, "loss": 4.5598, "loss/crossentropy": 2.1125508546829224, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2498009204864502, "step": 4878 }, { "epoch": 0.0976, "grad_norm": 2.375, "grad_norm_var": 0.004130045572916667, "learning_rate": 0.0001, "loss": 4.772, "loss/crossentropy": 2.13166081905365, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2640175372362137, "step": 4880 }, { "epoch": 0.09764, "grad_norm": 2.15625, "grad_norm_var": 0.0051910400390625, "learning_rate": 0.0001, "loss": 4.5276, "loss/crossentropy": 2.047860622406006, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23980721086263657, "step": 4882 }, { "epoch": 0.09768, "grad_norm": 2.515625, "grad_norm_var": 0.010477701822916666, "learning_rate": 0.0001, "loss": 4.8519, "loss/crossentropy": 2.362569808959961, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.27908293902873993, "step": 4884 }, { "epoch": 0.09772, "grad_norm": 2.109375, "grad_norm_var": 0.012398274739583333, "learning_rate": 0.0001, "loss": 4.2709, "loss/crossentropy": 1.8781500458717346, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22341010719537735, "step": 4886 }, { "epoch": 0.09776, "grad_norm": 2.390625, "grad_norm_var": 0.019807942708333335, "learning_rate": 0.0001, "loss": 4.9408, "loss/crossentropy": 2.1138893365859985, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2533891350030899, "step": 4888 }, { "epoch": 0.0978, "grad_norm": 2.484375, "grad_norm_var": 0.0188140869140625, "learning_rate": 0.0001, "loss": 4.5981, "loss/crossentropy": 2.2212090492248535, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.279159352183342, "step": 4890 }, { "epoch": 0.09784, "grad_norm": 2.453125, "grad_norm_var": 0.0177886962890625, "learning_rate": 0.0001, "loss": 4.7245, "loss/crossentropy": 1.9747707843780518, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24078013002872467, "step": 4892 }, { "epoch": 0.09788, "grad_norm": 2.53125, "grad_norm_var": 0.020015462239583334, "learning_rate": 0.0001, "loss": 4.4051, "loss/crossentropy": 1.8607316613197327, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25342857837677, "step": 4894 }, { "epoch": 0.09792, "grad_norm": 2.359375, "grad_norm_var": 0.020963541666666665, "learning_rate": 0.0001, "loss": 4.6759, "loss/crossentropy": 2.245271682739258, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24260863661766052, "step": 4896 }, { "epoch": 0.09796, "grad_norm": 2.390625, "grad_norm_var": 0.017560831705729165, "learning_rate": 0.0001, "loss": 4.5006, "loss/crossentropy": 2.0503702759742737, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25429168343544006, "step": 4898 }, { "epoch": 0.098, "grad_norm": 2.3125, "grad_norm_var": 0.017236328125, "learning_rate": 0.0001, "loss": 4.421, "loss/crossentropy": 1.7784077525138855, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2249627709388733, "step": 4900 }, { "epoch": 0.09804, "grad_norm": 2.390625, "grad_norm_var": 0.0114898681640625, "learning_rate": 0.0001, "loss": 4.5398, "loss/crossentropy": 1.9827336072921753, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22001181542873383, "step": 4902 }, { "epoch": 0.09808, "grad_norm": 2.53125, "grad_norm_var": 1.638996378580729, "learning_rate": 0.0001, "loss": 4.8149, "loss/crossentropy": 2.1003565788269043, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2550586014986038, "step": 4904 }, { "epoch": 0.09812, "grad_norm": 2.484375, "grad_norm_var": 1.6366933186848958, "learning_rate": 0.0001, "loss": 4.5099, "loss/crossentropy": 1.9565055966377258, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25481177121400833, "step": 4906 }, { "epoch": 0.09816, "grad_norm": 2.234375, "grad_norm_var": 1.6479777018229167, "learning_rate": 0.0001, "loss": 4.6214, "loss/crossentropy": 2.1693456172943115, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23336851596832275, "step": 4908 }, { "epoch": 0.0982, "grad_norm": 2.40625, "grad_norm_var": 1.6428995768229167, "learning_rate": 0.0001, "loss": 4.7637, "loss/crossentropy": 2.050541341304779, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25821831077337265, "step": 4910 }, { "epoch": 0.09824, "grad_norm": 2.40625, "grad_norm_var": 1.62906494140625, "learning_rate": 0.0001, "loss": 4.8515, "loss/crossentropy": 2.168497681617737, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2674206495285034, "step": 4912 }, { "epoch": 0.09828, "grad_norm": 2.609375, "grad_norm_var": 1.6097320556640624, "learning_rate": 0.0001, "loss": 5.1667, "loss/crossentropy": 2.147577404975891, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26900771260261536, "step": 4914 }, { "epoch": 0.09832, "grad_norm": 2.09375, "grad_norm_var": 1.6371734619140625, "learning_rate": 0.0001, "loss": 4.3958, "loss/crossentropy": 2.4436198472976685, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2504816800355911, "step": 4916 }, { "epoch": 0.09836, "grad_norm": 2.25, "grad_norm_var": 1.6447265625, "learning_rate": 0.0001, "loss": 4.7244, "loss/crossentropy": 2.097359538078308, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24005448818206787, "step": 4918 }, { "epoch": 0.0984, "grad_norm": 2.078125, "grad_norm_var": 0.0267974853515625, "learning_rate": 0.0001, "loss": 4.2305, "loss/crossentropy": 1.8467384576797485, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22932368516921997, "step": 4920 }, { "epoch": 0.09844, "grad_norm": 2.859375, "grad_norm_var": 0.04263916015625, "learning_rate": 0.0001, "loss": 4.4407, "loss/crossentropy": 2.1454135179519653, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2634875178337097, "step": 4922 }, { "epoch": 0.09848, "grad_norm": 2.765625, "grad_norm_var": 0.05164286295572917, "learning_rate": 0.0001, "loss": 4.5775, "loss/crossentropy": 1.9837967157363892, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23980768024921417, "step": 4924 }, { "epoch": 0.09852, "grad_norm": 2.265625, "grad_norm_var": 0.054032389322916666, "learning_rate": 0.0001, "loss": 4.6888, "loss/crossentropy": 2.099667489528656, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23482084274291992, "step": 4926 }, { "epoch": 0.09856, "grad_norm": 3.125, "grad_norm_var": 0.08787333170572917, "learning_rate": 0.0001, "loss": 4.5685, "loss/crossentropy": 1.8467332124710083, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.256004735827446, "step": 4928 }, { "epoch": 0.0986, "grad_norm": 2.53125, "grad_norm_var": 0.08548075358072917, "learning_rate": 0.0001, "loss": 4.4474, "loss/crossentropy": 2.036003887653351, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2377806007862091, "step": 4930 }, { "epoch": 0.09864, "grad_norm": 2.484375, "grad_norm_var": 0.07834370930989583, "learning_rate": 0.0001, "loss": 4.8258, "loss/crossentropy": 2.091457724571228, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24746537953615189, "step": 4932 }, { "epoch": 0.09868, "grad_norm": 2.296875, "grad_norm_var": 0.07929280598958334, "learning_rate": 0.0001, "loss": 4.6583, "loss/crossentropy": 2.245227336883545, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24141517281532288, "step": 4934 }, { "epoch": 0.09872, "grad_norm": 2.40625, "grad_norm_var": 0.0661285400390625, "learning_rate": 0.0001, "loss": 4.6514, "loss/crossentropy": 2.295682668685913, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.28035247325897217, "step": 4936 }, { "epoch": 0.09876, "grad_norm": 2.234375, "grad_norm_var": 0.05524800618489583, "learning_rate": 0.0001, "loss": 4.5525, "loss/crossentropy": 1.9251704812049866, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23609354346990585, "step": 4938 }, { "epoch": 0.0988, "grad_norm": 2.5, "grad_norm_var": 0.048314412434895836, "learning_rate": 0.0001, "loss": 5.1583, "loss/crossentropy": 2.289652466773987, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.30698196589946747, "step": 4940 }, { "epoch": 0.09884, "grad_norm": 2.09375, "grad_norm_var": 0.0534820556640625, "learning_rate": 0.0001, "loss": 4.7759, "loss/crossentropy": 2.3339043855667114, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2449272722005844, "step": 4942 }, { "epoch": 0.09888, "grad_norm": 2.203125, "grad_norm_var": 0.01949462890625, "learning_rate": 0.0001, "loss": 4.6678, "loss/crossentropy": 2.0610195994377136, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2553517669439316, "step": 4944 }, { "epoch": 0.09892, "grad_norm": 2.203125, "grad_norm_var": 0.018122355143229168, "learning_rate": 0.0001, "loss": 4.5849, "loss/crossentropy": 2.1093825101852417, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24767974764108658, "step": 4946 }, { "epoch": 0.09896, "grad_norm": 2.328125, "grad_norm_var": 0.01416015625, "learning_rate": 0.0001, "loss": 4.3631, "loss/crossentropy": 2.1742878556251526, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24640457332134247, "step": 4948 }, { "epoch": 0.099, "grad_norm": 2.390625, "grad_norm_var": 0.0144195556640625, "learning_rate": 0.0001, "loss": 4.8319, "loss/crossentropy": 2.4237486124038696, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2610916793346405, "step": 4950 }, { "epoch": 0.09904, "grad_norm": 2.15625, "grad_norm_var": 0.015445963541666666, "learning_rate": 0.0001, "loss": 4.3397, "loss/crossentropy": 2.197754144668579, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2275979220867157, "step": 4952 }, { "epoch": 0.09908, "grad_norm": 2.21875, "grad_norm_var": 0.01529541015625, "learning_rate": 0.0001, "loss": 4.4533, "loss/crossentropy": 2.267225503921509, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23391032963991165, "step": 4954 }, { "epoch": 0.09912, "grad_norm": 2.265625, "grad_norm_var": 0.009504191080729167, "learning_rate": 0.0001, "loss": 4.47, "loss/crossentropy": 2.04893159866333, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23763196170330048, "step": 4956 }, { "epoch": 0.09916, "grad_norm": 2.4375, "grad_norm_var": 0.010716756184895834, "learning_rate": 0.0001, "loss": 4.97, "loss/crossentropy": 2.4489223957061768, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.27698180079460144, "step": 4958 }, { "epoch": 0.0992, "grad_norm": 2.328125, "grad_norm_var": 0.011930338541666667, "learning_rate": 0.0001, "loss": 4.8604, "loss/crossentropy": 2.3654375076293945, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2495984137058258, "step": 4960 }, { "epoch": 0.09924, "grad_norm": 2.328125, "grad_norm_var": 0.011812337239583333, "learning_rate": 0.0001, "loss": 4.7671, "loss/crossentropy": 1.8583308458328247, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21832667291164398, "step": 4962 }, { "epoch": 0.09928, "grad_norm": 2.265625, "grad_norm_var": 0.01177978515625, "learning_rate": 0.0001, "loss": 4.4951, "loss/crossentropy": 2.0762425661087036, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2280900850892067, "step": 4964 }, { "epoch": 0.09932, "grad_norm": 2.15625, "grad_norm_var": 0.012532552083333334, "learning_rate": 0.0001, "loss": 4.7461, "loss/crossentropy": 2.228640913963318, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26595622301101685, "step": 4966 }, { "epoch": 0.09936, "grad_norm": 2.234375, "grad_norm_var": 0.0111236572265625, "learning_rate": 0.0001, "loss": 4.521, "loss/crossentropy": 2.1447466611862183, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2542327791452408, "step": 4968 }, { "epoch": 0.0994, "grad_norm": 2.09375, "grad_norm_var": 0.01226806640625, "learning_rate": 0.0001, "loss": 4.3959, "loss/crossentropy": 2.0750887989997864, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2362435683608055, "step": 4970 }, { "epoch": 0.09944, "grad_norm": 2.328125, "grad_norm_var": 0.009666951497395833, "learning_rate": 0.0001, "loss": 4.6705, "loss/crossentropy": 1.9413353204727173, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24418477714061737, "step": 4972 }, { "epoch": 0.09948, "grad_norm": 2.265625, "grad_norm_var": 0.00904541015625, "learning_rate": 0.0001, "loss": 4.609, "loss/crossentropy": 2.102766752243042, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24170882254838943, "step": 4974 }, { "epoch": 0.09952, "grad_norm": 2.359375, "grad_norm_var": 0.0090240478515625, "learning_rate": 0.0001, "loss": 4.7568, "loss/crossentropy": 2.431061267852783, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2855495512485504, "step": 4976 }, { "epoch": 0.09956, "grad_norm": 2.453125, "grad_norm_var": 0.010448201497395834, "learning_rate": 0.0001, "loss": 4.7449, "loss/crossentropy": 1.9656312465667725, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26410341262817383, "step": 4978 }, { "epoch": 0.0996, "grad_norm": 2.578125, "grad_norm_var": 0.01627197265625, "learning_rate": 0.0001, "loss": 5.0286, "loss/crossentropy": 2.2365923523902893, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.27946531772613525, "step": 4980 }, { "epoch": 0.09964, "grad_norm": 2.4375, "grad_norm_var": 0.015184529622395833, "learning_rate": 0.0001, "loss": 4.5791, "loss/crossentropy": 2.203595757484436, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25247688591480255, "step": 4982 }, { "epoch": 0.09968, "grad_norm": 2.296875, "grad_norm_var": 0.0163482666015625, "learning_rate": 0.0001, "loss": 4.6824, "loss/crossentropy": 2.1462446451187134, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2539386674761772, "step": 4984 }, { "epoch": 0.09972, "grad_norm": 2.25, "grad_norm_var": 0.014127604166666667, "learning_rate": 0.0001, "loss": 4.7121, "loss/crossentropy": 2.4518587589263916, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24366125464439392, "step": 4986 }, { "epoch": 0.09976, "grad_norm": 2.25, "grad_norm_var": 0.01402587890625, "learning_rate": 0.0001, "loss": 4.5131, "loss/crossentropy": 2.003869950771332, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22874485701322556, "step": 4988 }, { "epoch": 0.0998, "grad_norm": 2.328125, "grad_norm_var": 0.017476399739583332, "learning_rate": 0.0001, "loss": 4.3221, "loss/crossentropy": 2.0251912474632263, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2240670546889305, "step": 4990 }, { "epoch": 0.09984, "grad_norm": 2.140625, "grad_norm_var": 0.020166015625, "learning_rate": 0.0001, "loss": 4.5297, "loss/crossentropy": 2.199779748916626, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22916750609874725, "step": 4992 }, { "epoch": 0.09988, "grad_norm": 2.40625, "grad_norm_var": 0.0207672119140625, "learning_rate": 0.0001, "loss": 4.8066, "loss/crossentropy": 2.3852288722991943, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26489999890327454, "step": 4994 }, { "epoch": 0.09992, "grad_norm": 2.359375, "grad_norm_var": 0.03874409993489583, "learning_rate": 0.0001, "loss": 4.7245, "loss/crossentropy": 1.9446094632148743, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24112944304943085, "step": 4996 }, { "epoch": 0.09996, "grad_norm": 2.234375, "grad_norm_var": 0.04269205729166667, "learning_rate": 0.0001, "loss": 4.7063, "loss/crossentropy": 2.585115671157837, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26535044610500336, "step": 4998 }, { "epoch": 0.1, "grad_norm": 2.8125, "grad_norm_var": 1.14400634765625, "learning_rate": 0.0001, "loss": 4.6848, "loss/crossentropy": 1.9871427416801453, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24747569859027863, "step": 5000 }, { "epoch": 0.10004, "grad_norm": 2.359375, "grad_norm_var": 1.1424967447916667, "learning_rate": 0.0001, "loss": 4.6058, "loss/crossentropy": 1.8981972336769104, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24131165444850922, "step": 5002 }, { "epoch": 0.10008, "grad_norm": 2.28125, "grad_norm_var": 1.1522786458333334, "learning_rate": 0.0001, "loss": 4.5731, "loss/crossentropy": 2.323825240135193, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2584778293967247, "step": 5004 }, { "epoch": 0.10012, "grad_norm": 2.390625, "grad_norm_var": 1.1363118489583333, "learning_rate": 0.0001, "loss": 4.7065, "loss/crossentropy": 1.728028118610382, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2997850477695465, "step": 5006 }, { "epoch": 0.10016, "grad_norm": 2.28125, "grad_norm_var": 1.12437744140625, "learning_rate": 0.0001, "loss": 4.8001, "loss/crossentropy": 2.1486289501190186, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25335805118083954, "step": 5008 }, { "epoch": 0.1002, "grad_norm": 2.5, "grad_norm_var": 1.1099761962890624, "learning_rate": 0.0001, "loss": 4.936, "loss/crossentropy": 2.3132145404815674, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26033517718315125, "step": 5010 }, { "epoch": 0.10024, "grad_norm": 2.671875, "grad_norm_var": 1.1134928385416667, "learning_rate": 0.0001, "loss": 4.493, "loss/crossentropy": 1.9233656525611877, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2471245899796486, "step": 5012 }, { "epoch": 0.10028, "grad_norm": 2.21875, "grad_norm_var": 1.1301747639973958, "learning_rate": 0.0001, "loss": 4.5221, "loss/crossentropy": 1.9435511827468872, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24730819463729858, "step": 5014 }, { "epoch": 0.10032, "grad_norm": 2.1875, "grad_norm_var": 0.018928019205729167, "learning_rate": 0.0001, "loss": 4.3895, "loss/crossentropy": 2.2031294107437134, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23213820159435272, "step": 5016 }, { "epoch": 0.10036, "grad_norm": 2.703125, "grad_norm_var": 0.0276763916015625, "learning_rate": 0.0001, "loss": 4.7247, "loss/crossentropy": 2.285850405693054, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25809091329574585, "step": 5018 }, { "epoch": 0.1004, "grad_norm": 2.328125, "grad_norm_var": 0.025211588541666666, "learning_rate": 0.0001, "loss": 4.7697, "loss/crossentropy": 1.8660435676574707, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2436714917421341, "step": 5020 }, { "epoch": 0.10044, "grad_norm": 2.421875, "grad_norm_var": 0.025877888997395834, "learning_rate": 0.0001, "loss": 4.3716, "loss/crossentropy": 2.0659420490264893, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24248096346855164, "step": 5022 }, { "epoch": 0.10048, "grad_norm": 2.328125, "grad_norm_var": 0.030516560872395834, "learning_rate": 0.0001, "loss": 4.7093, "loss/crossentropy": 2.213133215904236, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2423061951994896, "step": 5024 }, { "epoch": 0.10052, "grad_norm": 2.234375, "grad_norm_var": 0.0313385009765625, "learning_rate": 0.0001, "loss": 4.5288, "loss/crossentropy": 2.3052343130111694, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24757324904203415, "step": 5026 }, { "epoch": 0.10056, "grad_norm": 2.796875, "grad_norm_var": 0.039094034830729166, "learning_rate": 0.0001, "loss": 4.8709, "loss/crossentropy": 2.226990580558777, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25892695784568787, "step": 5028 }, { "epoch": 0.1006, "grad_norm": 2.71875, "grad_norm_var": 0.04163004557291667, "learning_rate": 0.0001, "loss": 4.9444, "loss/crossentropy": 2.3460742235183716, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.27760128676891327, "step": 5030 }, { "epoch": 0.10064, "grad_norm": 2.125, "grad_norm_var": 0.04345703125, "learning_rate": 0.0001, "loss": 4.3597, "loss/crossentropy": 1.9782095551490784, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2349972277879715, "step": 5032 }, { "epoch": 0.10068, "grad_norm": 2.96875, "grad_norm_var": 0.06148681640625, "learning_rate": 0.0001, "loss": 4.5602, "loss/crossentropy": 1.847929298877716, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22401423752307892, "step": 5034 }, { "epoch": 0.10072, "grad_norm": 2.46875, "grad_norm_var": 0.06073811848958333, "learning_rate": 0.0001, "loss": 4.4989, "loss/crossentropy": 2.172071158885956, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2656550332903862, "step": 5036 }, { "epoch": 0.10076, "grad_norm": 2.234375, "grad_norm_var": 0.07026265462239584, "learning_rate": 0.0001, "loss": 4.3892, "loss/crossentropy": 2.3497499227523804, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26085225492715836, "step": 5038 }, { "epoch": 0.1008, "grad_norm": 2.328125, "grad_norm_var": 0.07284749348958333, "learning_rate": 0.0001, "loss": 4.2583, "loss/crossentropy": 2.0916348695755005, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21129868924617767, "step": 5040 }, { "epoch": 0.10084, "grad_norm": 2.59375, "grad_norm_var": 0.07073160807291666, "learning_rate": 0.0001, "loss": 4.8931, "loss/crossentropy": 2.243077278137207, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2657035142183304, "step": 5042 }, { "epoch": 0.10088, "grad_norm": 10.0, "grad_norm_var": 3.600194295247396, "learning_rate": 0.0001, "loss": 4.8887, "loss/crossentropy": 1.9361066222190857, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.41255413740873337, "step": 5044 }, { "epoch": 0.10092, "grad_norm": 3.703125, "grad_norm_var": 3.6162760416666666, "learning_rate": 0.0001, "loss": 4.7632, "loss/crossentropy": 2.0139313340187073, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2591235190629959, "step": 5046 }, { "epoch": 0.10096, "grad_norm": 2.203125, "grad_norm_var": 3.589704386393229, "learning_rate": 0.0001, "loss": 4.6642, "loss/crossentropy": 2.335710287094116, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26457205414772034, "step": 5048 }, { "epoch": 0.101, "grad_norm": 2.34375, "grad_norm_var": 3.604325358072917, "learning_rate": 0.0001, "loss": 4.3329, "loss/crossentropy": 1.9269848465919495, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23867928236722946, "step": 5050 }, { "epoch": 0.10104, "grad_norm": 2.34375, "grad_norm_var": 3.627415974934896, "learning_rate": 0.0001, "loss": 4.4982, "loss/crossentropy": 1.9732608795166016, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24961821734905243, "step": 5052 }, { "epoch": 0.10108, "grad_norm": 2.359375, "grad_norm_var": 3.6287506103515623, "learning_rate": 0.0001, "loss": 4.7218, "loss/crossentropy": 2.2659696340560913, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24580512940883636, "step": 5054 }, { "epoch": 0.10112, "grad_norm": 2.84375, "grad_norm_var": 3.57880859375, "learning_rate": 0.0001, "loss": 4.4318, "loss/crossentropy": 1.6600720882415771, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20283473283052444, "step": 5056 }, { "epoch": 0.10116, "grad_norm": 2.375, "grad_norm_var": 3.602855428059896, "learning_rate": 0.0001, "loss": 4.8249, "loss/crossentropy": 2.0175185799598694, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23329314589500427, "step": 5058 }, { "epoch": 0.1012, "grad_norm": 2.4375, "grad_norm_var": 0.146484375, "learning_rate": 0.0001, "loss": 4.5132, "loss/crossentropy": 2.259668231010437, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2692077234387398, "step": 5060 }, { "epoch": 0.10124, "grad_norm": 2.46875, "grad_norm_var": 0.02144775390625, "learning_rate": 0.0001, "loss": 4.6392, "loss/crossentropy": 2.260953903198242, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.28225430846214294, "step": 5062 }, { "epoch": 0.10128, "grad_norm": 2.171875, "grad_norm_var": 0.021686808268229166, "learning_rate": 0.0001, "loss": 4.3624, "loss/crossentropy": 1.9721892476081848, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2336483597755432, "step": 5064 }, { "epoch": 0.10132, "grad_norm": 2.40625, "grad_norm_var": 0.021512858072916665, "learning_rate": 0.0001, "loss": 4.6257, "loss/crossentropy": 2.188947319984436, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23959602415561676, "step": 5066 }, { "epoch": 0.10136, "grad_norm": 3.078125, "grad_norm_var": 0.04843343098958333, "learning_rate": 0.0001, "loss": 4.5601, "loss/crossentropy": 1.7914190292358398, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2234276980161667, "step": 5068 }, { "epoch": 0.1014, "grad_norm": 2.265625, "grad_norm_var": 0.049925740559895834, "learning_rate": 0.0001, "loss": 4.5806, "loss/crossentropy": 2.1124663949012756, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22523616254329681, "step": 5070 }, { "epoch": 0.10144, "grad_norm": 2.359375, "grad_norm_var": 0.03764546712239583, "learning_rate": 0.0001, "loss": 4.7098, "loss/crossentropy": 2.1146361231803894, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22483345866203308, "step": 5072 }, { "epoch": 0.10148, "grad_norm": 2.171875, "grad_norm_var": 0.04121805826822917, "learning_rate": 0.0001, "loss": 4.5076, "loss/crossentropy": 2.335755705833435, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.256914846599102, "step": 5074 }, { "epoch": 0.10152, "grad_norm": 2.34375, "grad_norm_var": 0.04346415201822917, "learning_rate": 0.0001, "loss": 4.8665, "loss/crossentropy": 2.2911819219589233, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26038119196891785, "step": 5076 }, { "epoch": 0.10156, "grad_norm": 2.171875, "grad_norm_var": 0.047587076822916664, "learning_rate": 0.0001, "loss": 4.592, "loss/crossentropy": 2.059769034385681, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24004006385803223, "step": 5078 }, { "epoch": 0.1016, "grad_norm": 2.109375, "grad_norm_var": 0.04888916015625, "learning_rate": 0.0001, "loss": 4.5028, "loss/crossentropy": 2.141201138496399, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22454539686441422, "step": 5080 }, { "epoch": 0.10164, "grad_norm": 2.421875, "grad_norm_var": 0.050146484375, "learning_rate": 0.0001, "loss": 4.7527, "loss/crossentropy": 1.9538633823394775, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24477297067642212, "step": 5082 }, { "epoch": 0.10168, "grad_norm": 2.1875, "grad_norm_var": 0.00992431640625, "learning_rate": 0.0001, "loss": 4.4799, "loss/crossentropy": 2.1555078625679016, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2425938919186592, "step": 5084 }, { "epoch": 0.10172, "grad_norm": 2.296875, "grad_norm_var": 0.009235636393229166, "learning_rate": 0.0001, "loss": 4.5583, "loss/crossentropy": 2.2306214570999146, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2445632517337799, "step": 5086 }, { "epoch": 0.10176, "grad_norm": 2.03125, "grad_norm_var": 0.010179646809895833, "learning_rate": 0.0001, "loss": 4.1075, "loss/crossentropy": 1.9713392853736877, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22239256650209427, "step": 5088 }, { "epoch": 0.1018, "grad_norm": 2.234375, "grad_norm_var": 0.011799112955729166, "learning_rate": 0.0001, "loss": 4.5181, "loss/crossentropy": 1.951128602027893, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24310563504695892, "step": 5090 }, { "epoch": 0.10184, "grad_norm": 2.28125, "grad_norm_var": 0.011051432291666666, "learning_rate": 0.0001, "loss": 4.3617, "loss/crossentropy": 2.0100057125091553, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23054375499486923, "step": 5092 }, { "epoch": 0.10188, "grad_norm": 2.140625, "grad_norm_var": 0.011042277018229166, "learning_rate": 0.0001, "loss": 4.4573, "loss/crossentropy": 2.1898789405822754, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24956130981445312, "step": 5094 }, { "epoch": 0.10192, "grad_norm": 2.21875, "grad_norm_var": 0.021512858072916665, "learning_rate": 0.0001, "loss": 4.6576, "loss/crossentropy": 1.5666239857673645, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20010025054216385, "step": 5096 }, { "epoch": 0.10196, "grad_norm": 2.296875, "grad_norm_var": 0.022468058268229167, "learning_rate": 0.0001, "loss": 4.2762, "loss/crossentropy": 1.884951651096344, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2484818547964096, "step": 5098 }, { "epoch": 0.102, "grad_norm": 2.34375, "grad_norm_var": 0.022554524739583335, "learning_rate": 0.0001, "loss": 4.5207, "loss/crossentropy": 1.976080298423767, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23380043357610703, "step": 5100 }, { "epoch": 0.10204, "grad_norm": 2.375, "grad_norm_var": 0.0229156494140625, "learning_rate": 0.0001, "loss": 4.592, "loss/crossentropy": 2.1262658834457397, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2441401332616806, "step": 5102 }, { "epoch": 0.10208, "grad_norm": 2.515625, "grad_norm_var": 0.08055013020833333, "learning_rate": 0.0001, "loss": 4.2735, "loss/crossentropy": 1.7588757276535034, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2101762592792511, "step": 5104 }, { "epoch": 0.10212, "grad_norm": 2.578125, "grad_norm_var": 0.08059488932291667, "learning_rate": 0.0001, "loss": 4.6721, "loss/crossentropy": 2.292783260345459, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26928654313087463, "step": 5106 }, { "epoch": 0.10216, "grad_norm": 2.65625, "grad_norm_var": 0.4834218343098958, "learning_rate": 0.0001, "loss": 4.7475, "loss/crossentropy": 2.0670888423919678, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2644767463207245, "step": 5108 }, { "epoch": 0.1022, "grad_norm": 2.1875, "grad_norm_var": 0.4729563395182292, "learning_rate": 0.0001, "loss": 4.4113, "loss/crossentropy": 2.0522598028182983, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2410094290971756, "step": 5110 }, { "epoch": 0.10224, "grad_norm": 2.359375, "grad_norm_var": 0.4729075113932292, "learning_rate": 0.0001, "loss": 4.4914, "loss/crossentropy": 2.0756974816322327, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.27077721059322357, "step": 5112 }, { "epoch": 0.10228, "grad_norm": 2.3125, "grad_norm_var": 0.47226155598958336, "learning_rate": 0.0001, "loss": 4.6524, "loss/crossentropy": 2.1569767594337463, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23582034558057785, "step": 5114 }, { "epoch": 0.10232, "grad_norm": 2.21875, "grad_norm_var": 0.4847819010416667, "learning_rate": 0.0001, "loss": 4.2821, "loss/crossentropy": 1.9736077785491943, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23984025418758392, "step": 5116 }, { "epoch": 0.10236, "grad_norm": 2.3125, "grad_norm_var": 0.4942698160807292, "learning_rate": 0.0001, "loss": 4.3047, "loss/crossentropy": 2.1400066614151, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2347380667924881, "step": 5118 }, { "epoch": 0.1024, "grad_norm": 2.171875, "grad_norm_var": 0.46923726399739585, "learning_rate": 0.0001, "loss": 4.3531, "loss/crossentropy": 1.989999234676361, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22508147358894348, "step": 5120 }, { "epoch": 0.10244, "grad_norm": 2.5, "grad_norm_var": 0.4704498291015625, "learning_rate": 0.0001, "loss": 4.8817, "loss/crossentropy": 2.132390856742859, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3074522316455841, "step": 5122 }, { "epoch": 0.10248, "grad_norm": 2.296875, "grad_norm_var": 0.011002604166666667, "learning_rate": 0.0001, "loss": 4.3606, "loss/crossentropy": 1.7906856536865234, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2126675397157669, "step": 5124 }, { "epoch": 0.10252, "grad_norm": 2.265625, "grad_norm_var": 0.0112457275390625, "learning_rate": 0.0001, "loss": 4.7045, "loss/crossentropy": 2.0576369762420654, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.27211636304855347, "step": 5126 }, { "epoch": 0.10256, "grad_norm": 2.265625, "grad_norm_var": 0.01099853515625, "learning_rate": 0.0001, "loss": 4.5402, "loss/crossentropy": 2.1174184679985046, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2344469577074051, "step": 5128 }, { "epoch": 0.1026, "grad_norm": 2.28125, "grad_norm_var": 0.0090972900390625, "learning_rate": 0.0001, "loss": 4.7227, "loss/crossentropy": 1.9139717817306519, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2362786829471588, "step": 5130 }, { "epoch": 0.10264, "grad_norm": 2.3125, "grad_norm_var": 0.008495076497395834, "learning_rate": 0.0001, "loss": 4.4801, "loss/crossentropy": 2.022357940673828, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2307513877749443, "step": 5132 }, { "epoch": 0.10268, "grad_norm": 2.203125, "grad_norm_var": 0.007721964518229167, "learning_rate": 0.0001, "loss": 4.3963, "loss/crossentropy": 2.038477897644043, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22871223092079163, "step": 5134 }, { "epoch": 0.10272, "grad_norm": 2.421875, "grad_norm_var": 0.007079060872395833, "learning_rate": 0.0001, "loss": 4.7283, "loss/crossentropy": 2.0895442962646484, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23719681799411774, "step": 5136 }, { "epoch": 0.10276, "grad_norm": 2.5625, "grad_norm_var": 0.008820597330729167, "learning_rate": 0.0001, "loss": 4.7059, "loss/crossentropy": 2.17978835105896, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2519204765558243, "step": 5138 }, { "epoch": 0.1028, "grad_norm": 2.46875, "grad_norm_var": 0.00953369140625, "learning_rate": 0.0001, "loss": 4.7318, "loss/crossentropy": 2.19089937210083, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24376338720321655, "step": 5140 }, { "epoch": 0.10284, "grad_norm": 2.515625, "grad_norm_var": 0.011002604166666667, "learning_rate": 0.0001, "loss": 4.3961, "loss/crossentropy": 2.018259823322296, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23707401752471924, "step": 5142 }, { "epoch": 0.10288, "grad_norm": 2.484375, "grad_norm_var": 0.012287394205729166, "learning_rate": 0.0001, "loss": 4.7556, "loss/crossentropy": 2.110401153564453, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23409561812877655, "step": 5144 }, { "epoch": 0.10292, "grad_norm": 2.390625, "grad_norm_var": 0.012239583333333333, "learning_rate": 0.0001, "loss": 4.615, "loss/crossentropy": 2.2096832990646362, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2580249160528183, "step": 5146 }, { "epoch": 0.10296, "grad_norm": 2.09375, "grad_norm_var": 0.015803019205729168, "learning_rate": 0.0001, "loss": 4.5218, "loss/crossentropy": 2.1825822591781616, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23592937737703323, "step": 5148 }, { "epoch": 0.103, "grad_norm": 2.359375, "grad_norm_var": 0.01451416015625, "learning_rate": 0.0001, "loss": 4.6945, "loss/crossentropy": 2.2440234422683716, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23741164803504944, "step": 5150 }, { "epoch": 0.10304, "grad_norm": 2.4375, "grad_norm_var": 0.015071614583333334, "learning_rate": 0.0001, "loss": 4.962, "loss/crossentropy": 2.3818799257278442, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2642097622156143, "step": 5152 }, { "epoch": 0.10308, "grad_norm": 2.296875, "grad_norm_var": 0.014378865559895834, "learning_rate": 0.0001, "loss": 4.5706, "loss/crossentropy": 2.1905806064605713, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26945509016513824, "step": 5154 }, { "epoch": 0.10312, "grad_norm": 2.140625, "grad_norm_var": 0.017671712239583335, "learning_rate": 0.0001, "loss": 4.5678, "loss/crossentropy": 2.096913695335388, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2540033459663391, "step": 5156 }, { "epoch": 0.10316, "grad_norm": 2.375, "grad_norm_var": 0.015721638997395832, "learning_rate": 0.0001, "loss": 4.5286, "loss/crossentropy": 1.7916489243507385, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22504088282585144, "step": 5158 }, { "epoch": 0.1032, "grad_norm": 2.34375, "grad_norm_var": 0.014240519205729166, "learning_rate": 0.0001, "loss": 4.6256, "loss/crossentropy": 1.9366079568862915, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25474467873573303, "step": 5160 }, { "epoch": 0.10324, "grad_norm": 2.40625, "grad_norm_var": 0.014533487955729167, "learning_rate": 0.0001, "loss": 4.5054, "loss/crossentropy": 2.0233771800994873, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23394355177879333, "step": 5162 }, { "epoch": 0.10328, "grad_norm": 2.234375, "grad_norm_var": 0.010693359375, "learning_rate": 0.0001, "loss": 4.7803, "loss/crossentropy": 2.442312717437744, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.28390438854694366, "step": 5164 }, { "epoch": 0.10332, "grad_norm": 2.3125, "grad_norm_var": 0.014435831705729167, "learning_rate": 0.0001, "loss": 4.7849, "loss/crossentropy": 1.9547526836395264, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2298036813735962, "step": 5166 }, { "epoch": 0.10336, "grad_norm": 2.25, "grad_norm_var": 0.018766276041666665, "learning_rate": 0.0001, "loss": 4.3225, "loss/crossentropy": 1.7974739074707031, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21959447860717773, "step": 5168 }, { "epoch": 0.1034, "grad_norm": 2.328125, "grad_norm_var": 0.020075480143229168, "learning_rate": 0.0001, "loss": 4.5438, "loss/crossentropy": 1.9860564470291138, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24661653488874435, "step": 5170 }, { "epoch": 0.10344, "grad_norm": 2.34375, "grad_norm_var": 0.019050089518229167, "learning_rate": 0.0001, "loss": 4.5084, "loss/crossentropy": 1.6198940873146057, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20592768490314484, "step": 5172 }, { "epoch": 0.10348, "grad_norm": 2.25, "grad_norm_var": 0.019554646809895833, "learning_rate": 0.0001, "loss": 4.4831, "loss/crossentropy": 2.193474531173706, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24437790364027023, "step": 5174 }, { "epoch": 0.10352, "grad_norm": 2.5, "grad_norm_var": 0.022557576497395832, "learning_rate": 0.0001, "loss": 4.9209, "loss/crossentropy": 2.221992254257202, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2931511402130127, "step": 5176 }, { "epoch": 0.10356, "grad_norm": 2.28125, "grad_norm_var": 0.022272745768229168, "learning_rate": 0.0001, "loss": 4.4611, "loss/crossentropy": 2.006419837474823, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2335921749472618, "step": 5178 }, { "epoch": 0.1036, "grad_norm": 2.171875, "grad_norm_var": 0.022175089518229166, "learning_rate": 0.0001, "loss": 4.4477, "loss/crossentropy": 2.2861804962158203, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2549041658639908, "step": 5180 }, { "epoch": 0.10364, "grad_norm": 2.171875, "grad_norm_var": 0.014378865559895834, "learning_rate": 0.0001, "loss": 4.4079, "loss/crossentropy": 2.217998743057251, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24741299450397491, "step": 5182 }, { "epoch": 0.10368, "grad_norm": 2.328125, "grad_norm_var": 0.0123931884765625, "learning_rate": 0.0001, "loss": 4.7344, "loss/crossentropy": 2.1875526905059814, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23636415600776672, "step": 5184 }, { "epoch": 0.10372, "grad_norm": 2.078125, "grad_norm_var": 0.014850870768229166, "learning_rate": 0.0001, "loss": 4.4629, "loss/crossentropy": 1.8408135175704956, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23278112709522247, "step": 5186 }, { "epoch": 0.10376, "grad_norm": 2.359375, "grad_norm_var": 0.014188639322916667, "learning_rate": 0.0001, "loss": 4.6848, "loss/crossentropy": 1.7936646342277527, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23859456181526184, "step": 5188 }, { "epoch": 0.1038, "grad_norm": 2.21875, "grad_norm_var": 0.014058430989583334, "learning_rate": 0.0001, "loss": 4.3825, "loss/crossentropy": 2.0800318717956543, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23155340552330017, "step": 5190 }, { "epoch": 0.10384, "grad_norm": 2.265625, "grad_norm_var": 0.0066802978515625, "learning_rate": 0.0001, "loss": 4.9645, "loss/crossentropy": 2.277778387069702, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25029345601797104, "step": 5192 }, { "epoch": 0.10388, "grad_norm": 2.09375, "grad_norm_var": 0.0081695556640625, "learning_rate": 0.0001, "loss": 4.2486, "loss/crossentropy": 1.9658478498458862, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22655482590198517, "step": 5194 }, { "epoch": 0.10392, "grad_norm": 2.21875, "grad_norm_var": 0.01060791015625, "learning_rate": 0.0001, "loss": 4.7503, "loss/crossentropy": 2.214509129524231, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.254493810236454, "step": 5196 }, { "epoch": 0.10396, "grad_norm": 2.21875, "grad_norm_var": 0.0102203369140625, "learning_rate": 0.0001, "loss": 4.3648, "loss/crossentropy": 1.9465742707252502, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23288530111312866, "step": 5198 }, { "epoch": 0.104, "grad_norm": 2.21875, "grad_norm_var": 0.010277303059895833, "learning_rate": 0.0001, "loss": 4.3216, "loss/crossentropy": 2.062779188156128, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21423730999231339, "step": 5200 }, { "epoch": 0.10404, "grad_norm": 2.390625, "grad_norm_var": 0.010350545247395834, "learning_rate": 0.0001, "loss": 4.1645, "loss/crossentropy": 1.777470588684082, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22998665273189545, "step": 5202 }, { "epoch": 0.10408, "grad_norm": 2.46875, "grad_norm_var": 0.05396728515625, "learning_rate": 0.0001, "loss": 4.8979, "loss/crossentropy": 2.2505980730056763, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.30352361500263214, "step": 5204 }, { "epoch": 0.10412, "grad_norm": 3.359375, "grad_norm_var": 0.12148335774739584, "learning_rate": 0.0001, "loss": 4.4093, "loss/crossentropy": 1.8989517092704773, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2210940569639206, "step": 5206 }, { "epoch": 0.10416, "grad_norm": 2.734375, "grad_norm_var": 0.12923177083333334, "learning_rate": 0.0001, "loss": 4.5535, "loss/crossentropy": 2.378798723220825, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25575730204582214, "step": 5208 }, { "epoch": 0.1042, "grad_norm": 2.34375, "grad_norm_var": 0.11901041666666666, "learning_rate": 0.0001, "loss": 4.6763, "loss/crossentropy": 1.7642006278038025, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22105304896831512, "step": 5210 }, { "epoch": 0.10424, "grad_norm": 2.375, "grad_norm_var": 0.11607666015625, "learning_rate": 0.0001, "loss": 4.8945, "loss/crossentropy": 2.188746988773346, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24192160367965698, "step": 5212 }, { "epoch": 0.10428, "grad_norm": 2.3125, "grad_norm_var": 0.11298421223958334, "learning_rate": 0.0001, "loss": 4.6001, "loss/crossentropy": 2.116630494594574, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2879187613725662, "step": 5214 }, { "epoch": 0.10432, "grad_norm": 2.15625, "grad_norm_var": 0.11013895670572917, "learning_rate": 0.0001, "loss": 4.4932, "loss/crossentropy": 1.8329599499702454, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21425092220306396, "step": 5216 }, { "epoch": 0.10436, "grad_norm": 2.296875, "grad_norm_var": 0.1064849853515625, "learning_rate": 0.0001, "loss": 4.3966, "loss/crossentropy": 2.1063259840011597, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23704984784126282, "step": 5218 }, { "epoch": 0.1044, "grad_norm": 2.25, "grad_norm_var": 0.08238525390625, "learning_rate": 0.0001, "loss": 4.4821, "loss/crossentropy": 1.7994996309280396, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21408653259277344, "step": 5220 }, { "epoch": 0.10444, "grad_norm": 2.65625, "grad_norm_var": 0.0281158447265625, "learning_rate": 0.0001, "loss": 4.2437, "loss/crossentropy": 1.880006492137909, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24362730979919434, "step": 5222 }, { "epoch": 0.10448, "grad_norm": 2.25, "grad_norm_var": 0.017853800455729166, "learning_rate": 0.0001, "loss": 4.2726, "loss/crossentropy": 2.010268449783325, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22580985724925995, "step": 5224 }, { "epoch": 0.10452, "grad_norm": 2.3125, "grad_norm_var": 0.0177886962890625, "learning_rate": 0.0001, "loss": 4.6839, "loss/crossentropy": 2.057162046432495, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23878254741430283, "step": 5226 }, { "epoch": 0.10456, "grad_norm": 2.21875, "grad_norm_var": 0.019603474934895834, "learning_rate": 0.0001, "loss": 4.3774, "loss/crossentropy": 2.0615930557250977, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22890077531337738, "step": 5228 }, { "epoch": 0.1046, "grad_norm": 2.40625, "grad_norm_var": 0.020817057291666666, "learning_rate": 0.0001, "loss": 4.7693, "loss/crossentropy": 1.9013472199440002, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22539222240447998, "step": 5230 }, { "epoch": 0.10464, "grad_norm": 2.296875, "grad_norm_var": 0.019482421875, "learning_rate": 0.0001, "loss": 4.5531, "loss/crossentropy": 2.239185929298401, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24706681817770004, "step": 5232 }, { "epoch": 0.10468, "grad_norm": 2.234375, "grad_norm_var": 0.020540364583333335, "learning_rate": 0.0001, "loss": 4.3902, "loss/crossentropy": 1.9881523251533508, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24434638023376465, "step": 5234 }, { "epoch": 0.10472, "grad_norm": 2.3125, "grad_norm_var": 0.020524088541666666, "learning_rate": 0.0001, "loss": 4.7908, "loss/crossentropy": 1.9529212713241577, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2321469634771347, "step": 5236 }, { "epoch": 0.10476, "grad_norm": 2.40625, "grad_norm_var": 0.01041259765625, "learning_rate": 0.0001, "loss": 4.8045, "loss/crossentropy": 2.196424722671509, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24054434895515442, "step": 5238 }, { "epoch": 0.1048, "grad_norm": 2.234375, "grad_norm_var": 0.009781901041666667, "learning_rate": 0.0001, "loss": 4.5895, "loss/crossentropy": 2.082987070083618, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2760557308793068, "step": 5240 }, { "epoch": 0.10484, "grad_norm": 2.359375, "grad_norm_var": 0.011652628580729166, "learning_rate": 0.0001, "loss": 4.5961, "loss/crossentropy": 2.2369720935821533, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25305214524269104, "step": 5242 }, { "epoch": 0.10488, "grad_norm": 2.1875, "grad_norm_var": 0.0119537353515625, "learning_rate": 0.0001, "loss": 4.7268, "loss/crossentropy": 2.3372031450271606, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26765232533216476, "step": 5244 }, { "epoch": 0.10492, "grad_norm": 2.3125, "grad_norm_var": 0.0113677978515625, "learning_rate": 0.0001, "loss": 4.6148, "loss/crossentropy": 2.23944628238678, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26508912444114685, "step": 5246 }, { "epoch": 0.10496, "grad_norm": 2.21875, "grad_norm_var": 0.012105305989583334, "learning_rate": 0.0001, "loss": 4.3453, "loss/crossentropy": 2.2701858282089233, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2551077604293823, "step": 5248 }, { "epoch": 0.105, "grad_norm": 2.234375, "grad_norm_var": 0.010887654622395833, "learning_rate": 0.0001, "loss": 4.6455, "loss/crossentropy": 2.293464779853821, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25016437470912933, "step": 5250 }, { "epoch": 0.10504, "grad_norm": 2.46875, "grad_norm_var": 0.0109527587890625, "learning_rate": 0.0001, "loss": 4.5798, "loss/crossentropy": 2.3072171211242676, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.27824972569942474, "step": 5252 }, { "epoch": 0.10508, "grad_norm": 2.453125, "grad_norm_var": 0.010498046875, "learning_rate": 0.0001, "loss": 4.5948, "loss/crossentropy": 1.9855756759643555, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2312464788556099, "step": 5254 }, { "epoch": 0.10512, "grad_norm": 2.3125, "grad_norm_var": 0.009129842122395834, "learning_rate": 0.0001, "loss": 4.8104, "loss/crossentropy": 1.9584077596664429, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2386016771197319, "step": 5256 }, { "epoch": 0.10516, "grad_norm": 2.265625, "grad_norm_var": 0.010741170247395833, "learning_rate": 0.0001, "loss": 4.6103, "loss/crossentropy": 2.2184669375419617, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25914129614830017, "step": 5258 }, { "epoch": 0.1052, "grad_norm": 2.1875, "grad_norm_var": 0.0122222900390625, "learning_rate": 0.0001, "loss": 4.1221, "loss/crossentropy": 1.6798554062843323, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2037271112203598, "step": 5260 }, { "epoch": 0.10524, "grad_norm": 2.203125, "grad_norm_var": 0.011995442708333333, "learning_rate": 0.0001, "loss": 4.2718, "loss/crossentropy": 1.8675006031990051, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21148693561553955, "step": 5262 }, { "epoch": 0.10528, "grad_norm": 2.390625, "grad_norm_var": 0.012336222330729167, "learning_rate": 0.0001, "loss": 4.5136, "loss/crossentropy": 1.9439855813980103, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24611759185791016, "step": 5264 }, { "epoch": 0.10532, "grad_norm": 2.171875, "grad_norm_var": 0.013923136393229167, "learning_rate": 0.0001, "loss": 4.5298, "loss/crossentropy": 2.1550748348236084, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2428945228457451, "step": 5266 }, { "epoch": 0.10536, "grad_norm": 2.34375, "grad_norm_var": 0.0117584228515625, "learning_rate": 0.0001, "loss": 4.6777, "loss/crossentropy": 1.9924054741859436, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21866007149219513, "step": 5268 }, { "epoch": 0.1054, "grad_norm": 2.40625, "grad_norm_var": 0.011571248372395834, "learning_rate": 0.0001, "loss": 4.5359, "loss/crossentropy": 2.0413920879364014, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2579498365521431, "step": 5270 }, { "epoch": 0.10544, "grad_norm": 2.578125, "grad_norm_var": 0.016380818684895833, "learning_rate": 0.0001, "loss": 4.6406, "loss/crossentropy": 2.062632381916046, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24655035883188248, "step": 5272 }, { "epoch": 0.10548, "grad_norm": 2.421875, "grad_norm_var": 0.021833292643229165, "learning_rate": 0.0001, "loss": 4.8133, "loss/crossentropy": 2.0620386600494385, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3041655272245407, "step": 5274 }, { "epoch": 0.10552, "grad_norm": 2.234375, "grad_norm_var": 0.016657511393229168, "learning_rate": 0.0001, "loss": 4.4775, "loss/crossentropy": 1.966421365737915, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22333864122629166, "step": 5276 }, { "epoch": 0.10556, "grad_norm": 2.25, "grad_norm_var": 0.017650349934895834, "learning_rate": 0.0001, "loss": 4.4323, "loss/crossentropy": 1.9120238423347473, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22857370972633362, "step": 5278 }, { "epoch": 0.1056, "grad_norm": 2.40625, "grad_norm_var": 0.019701131184895835, "learning_rate": 0.0001, "loss": 4.5179, "loss/crossentropy": 2.084389805793762, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2556762397289276, "step": 5280 }, { "epoch": 0.10564, "grad_norm": 2.328125, "grad_norm_var": 0.019758097330729165, "learning_rate": 0.0001, "loss": 4.4552, "loss/crossentropy": 1.8707188367843628, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22955116629600525, "step": 5282 }, { "epoch": 0.10568, "grad_norm": 2.265625, "grad_norm_var": 0.019775390625, "learning_rate": 0.0001, "loss": 4.3538, "loss/crossentropy": 1.8243692517280579, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21088172495365143, "step": 5284 }, { "epoch": 0.10572, "grad_norm": 2.359375, "grad_norm_var": 0.0192535400390625, "learning_rate": 0.0001, "loss": 4.5046, "loss/crossentropy": 2.111305356025696, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2595943957567215, "step": 5286 }, { "epoch": 0.10576, "grad_norm": 2.15625, "grad_norm_var": 0.015697224934895834, "learning_rate": 0.0001, "loss": 4.4598, "loss/crossentropy": 2.3729283809661865, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2632211595773697, "step": 5288 }, { "epoch": 0.1058, "grad_norm": 2.140625, "grad_norm_var": 0.007249959309895833, "learning_rate": 0.0001, "loss": 4.6256, "loss/crossentropy": 2.3542696237564087, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25645585358142853, "step": 5290 }, { "epoch": 0.10584, "grad_norm": 2.4375, "grad_norm_var": 0.01002197265625, "learning_rate": 0.0001, "loss": 4.4863, "loss/crossentropy": 2.0140068531036377, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23033145815134048, "step": 5292 }, { "epoch": 0.10588, "grad_norm": 2.375, "grad_norm_var": 0.0099761962890625, "learning_rate": 0.0001, "loss": 4.8126, "loss/crossentropy": 1.9499077796936035, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21458172798156738, "step": 5294 }, { "epoch": 0.10592, "grad_norm": 2.25, "grad_norm_var": 0.0084625244140625, "learning_rate": 0.0001, "loss": 4.479, "loss/crossentropy": 2.1434344053268433, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2552379444241524, "step": 5296 }, { "epoch": 0.10596, "grad_norm": 2.359375, "grad_norm_var": 0.007819620768229167, "learning_rate": 0.0001, "loss": 4.7149, "loss/crossentropy": 1.9951340556144714, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21683495491743088, "step": 5298 }, { "epoch": 0.106, "grad_norm": 2.5625, "grad_norm_var": 0.02330322265625, "learning_rate": 0.0001, "loss": 4.7855, "loss/crossentropy": 2.380235195159912, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2756577730178833, "step": 5300 }, { "epoch": 0.10604, "grad_norm": 2.3125, "grad_norm_var": 0.02330322265625, "learning_rate": 0.0001, "loss": 4.8596, "loss/crossentropy": 2.298241972923279, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2598598450422287, "step": 5302 }, { "epoch": 0.10608, "grad_norm": 2.5, "grad_norm_var": 0.032136027018229166, "learning_rate": 0.0001, "loss": 4.8615, "loss/crossentropy": 2.093233823776245, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23809552192687988, "step": 5304 }, { "epoch": 0.10612, "grad_norm": 2.15625, "grad_norm_var": 0.031493123372395834, "learning_rate": 0.0001, "loss": 4.5271, "loss/crossentropy": 2.0901564955711365, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24874016642570496, "step": 5306 }, { "epoch": 0.10616, "grad_norm": 2.453125, "grad_norm_var": 0.0304107666015625, "learning_rate": 0.0001, "loss": 4.3193, "loss/crossentropy": 1.8029736280441284, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20973137766122818, "step": 5308 }, { "epoch": 0.1062, "grad_norm": 2.25, "grad_norm_var": 0.03316650390625, "learning_rate": 0.0001, "loss": 4.4255, "loss/crossentropy": 2.4068437814712524, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25310443341732025, "step": 5310 }, { "epoch": 0.10624, "grad_norm": 2.265625, "grad_norm_var": 0.031029256184895833, "learning_rate": 0.0001, "loss": 4.4499, "loss/crossentropy": 2.1125503182411194, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2455870360136032, "step": 5312 }, { "epoch": 0.10628, "grad_norm": 2.234375, "grad_norm_var": 0.0323150634765625, "learning_rate": 0.0001, "loss": 4.4752, "loss/crossentropy": 2.0995737314224243, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2418016716837883, "step": 5314 }, { "epoch": 0.10632, "grad_norm": 2.234375, "grad_norm_var": 0.0212890625, "learning_rate": 0.0001, "loss": 4.5873, "loss/crossentropy": 1.8753212690353394, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24125799536705017, "step": 5316 }, { "epoch": 0.10636, "grad_norm": 2.390625, "grad_norm_var": 0.021451822916666665, "learning_rate": 0.0001, "loss": 4.7269, "loss/crossentropy": 2.0175408720970154, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23516137897968292, "step": 5318 }, { "epoch": 0.1064, "grad_norm": 2.109375, "grad_norm_var": 0.00914306640625, "learning_rate": 0.0001, "loss": 4.4953, "loss/crossentropy": 2.2671592235565186, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2356845736503601, "step": 5320 }, { "epoch": 0.10644, "grad_norm": 2.140625, "grad_norm_var": 0.009468587239583333, "learning_rate": 0.0001, "loss": 4.5328, "loss/crossentropy": 2.142452359199524, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26422591507434845, "step": 5322 }, { "epoch": 0.10648, "grad_norm": 2.3125, "grad_norm_var": 0.007225545247395834, "learning_rate": 0.0001, "loss": 4.4328, "loss/crossentropy": 1.9664896726608276, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25856246054172516, "step": 5324 }, { "epoch": 0.10652, "grad_norm": 2.203125, "grad_norm_var": 0.007222493489583333, "learning_rate": 0.0001, "loss": 4.5531, "loss/crossentropy": 2.168110191822052, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23274105042219162, "step": 5326 }, { "epoch": 0.10656, "grad_norm": 2.21875, "grad_norm_var": 0.00552978515625, "learning_rate": 0.0001, "loss": 4.5242, "loss/crossentropy": 2.006514251232147, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2532104551792145, "step": 5328 }, { "epoch": 0.1066, "grad_norm": 2.296875, "grad_norm_var": 0.0070220947265625, "learning_rate": 0.0001, "loss": 4.5593, "loss/crossentropy": 2.462701439857483, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2771739661693573, "step": 5330 }, { "epoch": 0.10664, "grad_norm": 2.203125, "grad_norm_var": 0.0076324462890625, "learning_rate": 0.0001, "loss": 4.4076, "loss/crossentropy": 2.0889209508895874, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2479858472943306, "step": 5332 }, { "epoch": 0.10668, "grad_norm": 2.25, "grad_norm_var": 0.008356730143229166, "learning_rate": 0.0001, "loss": 4.329, "loss/crossentropy": 1.8056100606918335, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22194840013980865, "step": 5334 }, { "epoch": 0.10672, "grad_norm": 2.171875, "grad_norm_var": 0.0075185139973958336, "learning_rate": 0.0001, "loss": 4.703, "loss/crossentropy": 2.32460880279541, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.28185322880744934, "step": 5336 }, { "epoch": 0.10676, "grad_norm": 2.140625, "grad_norm_var": 0.0075185139973958336, "learning_rate": 0.0001, "loss": 4.6388, "loss/crossentropy": 2.238978862762451, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24314726889133453, "step": 5338 }, { "epoch": 0.1068, "grad_norm": 2.0625, "grad_norm_var": 0.00914306640625, "learning_rate": 0.0001, "loss": 4.4161, "loss/crossentropy": 1.8914734721183777, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20767460763454437, "step": 5340 }, { "epoch": 0.10684, "grad_norm": 2.328125, "grad_norm_var": 0.010445149739583333, "learning_rate": 0.0001, "loss": 4.5628, "loss/crossentropy": 1.9704068899154663, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23617815226316452, "step": 5342 }, { "epoch": 0.10688, "grad_norm": 2.40625, "grad_norm_var": 0.013374837239583333, "learning_rate": 0.0001, "loss": 4.214, "loss/crossentropy": 1.8539690971374512, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2218247577548027, "step": 5344 }, { "epoch": 0.10692, "grad_norm": 2.28125, "grad_norm_var": 0.012596638997395833, "learning_rate": 0.0001, "loss": 4.6077, "loss/crossentropy": 1.982038140296936, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23828908801078796, "step": 5346 }, { "epoch": 0.10696, "grad_norm": 2.28125, "grad_norm_var": 0.013313802083333333, "learning_rate": 0.0001, "loss": 4.2879, "loss/crossentropy": 1.8247870802879333, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22487633675336838, "step": 5348 }, { "epoch": 0.107, "grad_norm": 2.1875, "grad_norm_var": 0.011253865559895833, "learning_rate": 0.0001, "loss": 4.382, "loss/crossentropy": 2.0704214572906494, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24332743138074875, "step": 5350 }, { "epoch": 0.10704, "grad_norm": 2.5, "grad_norm_var": 0.013509114583333334, "learning_rate": 0.0001, "loss": 4.709, "loss/crossentropy": 2.037345290184021, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2515557184815407, "step": 5352 }, { "epoch": 0.10708, "grad_norm": 2.609375, "grad_norm_var": 0.020702107747395834, "learning_rate": 0.0001, "loss": 4.5427, "loss/crossentropy": 2.0561426877975464, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24030926823616028, "step": 5354 }, { "epoch": 0.10712, "grad_norm": 2.3125, "grad_norm_var": 0.017943318684895834, "learning_rate": 0.0001, "loss": 4.3108, "loss/crossentropy": 1.6871100068092346, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2526541203260422, "step": 5356 }, { "epoch": 0.10716, "grad_norm": 2.390625, "grad_norm_var": 0.01978759765625, "learning_rate": 0.0001, "loss": 4.5238, "loss/crossentropy": 2.0133201479911804, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23663413524627686, "step": 5358 }, { "epoch": 0.1072, "grad_norm": 2.34375, "grad_norm_var": 0.0170562744140625, "learning_rate": 0.0001, "loss": 4.557, "loss/crossentropy": 2.0627574920654297, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24395380914211273, "step": 5360 }, { "epoch": 0.10724, "grad_norm": 2.140625, "grad_norm_var": 0.020929972330729168, "learning_rate": 0.0001, "loss": 4.5495, "loss/crossentropy": 2.280818462371826, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2806383967399597, "step": 5362 }, { "epoch": 0.10728, "grad_norm": 2.5, "grad_norm_var": 0.022684733072916668, "learning_rate": 0.0001, "loss": 4.4743, "loss/crossentropy": 2.002636671066284, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23986798524856567, "step": 5364 }, { "epoch": 0.10732, "grad_norm": 2.453125, "grad_norm_var": 0.022298177083333332, "learning_rate": 0.0001, "loss": 4.6816, "loss/crossentropy": 2.042721927165985, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23199205100536346, "step": 5366 }, { "epoch": 0.10736, "grad_norm": 2.25, "grad_norm_var": 0.021540323893229168, "learning_rate": 0.0001, "loss": 4.3225, "loss/crossentropy": 2.1047908663749695, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23987916857004166, "step": 5368 }, { "epoch": 0.1074, "grad_norm": 2.1875, "grad_norm_var": 0.014774576822916666, "learning_rate": 0.0001, "loss": 4.4827, "loss/crossentropy": 2.0513075590133667, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.242512047290802, "step": 5370 }, { "epoch": 0.10744, "grad_norm": 2.4375, "grad_norm_var": 0.015913899739583334, "learning_rate": 0.0001, "loss": 4.4452, "loss/crossentropy": 1.9151215553283691, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23007020354270935, "step": 5372 }, { "epoch": 0.10748, "grad_norm": 2.125, "grad_norm_var": 0.019303385416666666, "learning_rate": 0.0001, "loss": 4.2344, "loss/crossentropy": 1.8759313821792603, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2068658247590065, "step": 5374 }, { "epoch": 0.10752, "grad_norm": 2.46875, "grad_norm_var": 0.020807902018229168, "learning_rate": 0.0001, "loss": 4.4667, "loss/crossentropy": 2.1500572562217712, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24392293393611908, "step": 5376 }, { "epoch": 0.10756, "grad_norm": 2.109375, "grad_norm_var": 0.018355305989583334, "learning_rate": 0.0001, "loss": 4.0917, "loss/crossentropy": 1.6089633703231812, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20190145075321198, "step": 5378 }, { "epoch": 0.1076, "grad_norm": 2.1875, "grad_norm_var": 0.014452107747395833, "learning_rate": 0.0001, "loss": 4.6521, "loss/crossentropy": 2.1967561841011047, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2749984338879585, "step": 5380 }, { "epoch": 0.10764, "grad_norm": 2.171875, "grad_norm_var": 0.011311848958333334, "learning_rate": 0.0001, "loss": 4.0843, "loss/crossentropy": 1.8293656706809998, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20650158822536469, "step": 5382 }, { "epoch": 0.10768, "grad_norm": 2.125, "grad_norm_var": 0.011847941080729167, "learning_rate": 0.0001, "loss": 4.3441, "loss/crossentropy": 2.3964673280715942, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2814205437898636, "step": 5384 }, { "epoch": 0.10772, "grad_norm": 2.4375, "grad_norm_var": 0.021842447916666667, "learning_rate": 0.0001, "loss": 4.8872, "loss/crossentropy": 2.4995274543762207, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.28704003244638443, "step": 5386 }, { "epoch": 0.10776, "grad_norm": 2.296875, "grad_norm_var": 0.019462076822916667, "learning_rate": 0.0001, "loss": 4.6066, "loss/crossentropy": 2.0650060176849365, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23606212437152863, "step": 5388 }, { "epoch": 0.1078, "grad_norm": 2.203125, "grad_norm_var": 0.01640625, "learning_rate": 0.0001, "loss": 4.3723, "loss/crossentropy": 2.3049341440200806, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23550046980381012, "step": 5390 }, { "epoch": 0.10784, "grad_norm": 2.21875, "grad_norm_var": 0.013505045572916667, "learning_rate": 0.0001, "loss": 4.813, "loss/crossentropy": 2.2687143087387085, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2854095697402954, "step": 5392 }, { "epoch": 0.10788, "grad_norm": 2.28125, "grad_norm_var": 0.014557902018229167, "learning_rate": 0.0001, "loss": 4.6267, "loss/crossentropy": 2.029325544834137, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23444775491952896, "step": 5394 }, { "epoch": 0.10792, "grad_norm": 2.3125, "grad_norm_var": 0.013895670572916666, "learning_rate": 0.0001, "loss": 4.5214, "loss/crossentropy": 2.2012031078338623, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.27986764907836914, "step": 5396 }, { "epoch": 0.10796, "grad_norm": 2.3125, "grad_norm_var": 0.012626139322916667, "learning_rate": 0.0001, "loss": 4.65, "loss/crossentropy": 2.2396020889282227, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24839117377996445, "step": 5398 }, { "epoch": 0.108, "grad_norm": 2.3125, "grad_norm_var": 0.009566243489583333, "learning_rate": 0.0001, "loss": 4.4634, "loss/crossentropy": 2.1481886506080627, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2545444592833519, "step": 5400 }, { "epoch": 0.10804, "grad_norm": 2.21875, "grad_norm_var": 0.006005859375, "learning_rate": 0.0001, "loss": 4.6109, "loss/crossentropy": 1.9799351692199707, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23296385258436203, "step": 5402 }, { "epoch": 0.10808, "grad_norm": 2.34375, "grad_norm_var": 0.006180826822916667, "learning_rate": 0.0001, "loss": 4.5612, "loss/crossentropy": 1.845237910747528, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2316332384943962, "step": 5404 }, { "epoch": 0.10812, "grad_norm": 2.234375, "grad_norm_var": 0.005692545572916667, "learning_rate": 0.0001, "loss": 4.4825, "loss/crossentropy": 2.078865647315979, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.251836359500885, "step": 5406 }, { "epoch": 0.10816, "grad_norm": 2.171875, "grad_norm_var": 0.006245930989583333, "learning_rate": 0.0001, "loss": 4.4409, "loss/crossentropy": 2.031971752643585, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23658733069896698, "step": 5408 }, { "epoch": 0.1082, "grad_norm": 2.21875, "grad_norm_var": 0.00504150390625, "learning_rate": 0.0001, "loss": 4.3034, "loss/crossentropy": 1.8173908591270447, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2169174626469612, "step": 5410 }, { "epoch": 0.10824, "grad_norm": 2.25, "grad_norm_var": 0.0086090087890625, "learning_rate": 0.0001, "loss": 4.838, "loss/crossentropy": 2.2501285672187805, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.240458145737648, "step": 5412 }, { "epoch": 0.10828, "grad_norm": 2.484375, "grad_norm_var": 0.015168253580729167, "learning_rate": 0.0001, "loss": 4.5449, "loss/crossentropy": 2.256573438644409, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26129382848739624, "step": 5414 }, { "epoch": 0.10832, "grad_norm": 2.359375, "grad_norm_var": 0.0153717041015625, "learning_rate": 0.0001, "loss": 4.7704, "loss/crossentropy": 2.2014705538749695, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2334136888384819, "step": 5416 }, { "epoch": 0.10836, "grad_norm": 2.34375, "grad_norm_var": 0.013016764322916667, "learning_rate": 0.0001, "loss": 4.4046, "loss/crossentropy": 1.8590609431266785, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2271011471748352, "step": 5418 }, { "epoch": 0.1084, "grad_norm": 2.40625, "grad_norm_var": 0.017378743489583334, "learning_rate": 0.0001, "loss": 4.9419, "loss/crossentropy": 2.2923961877822876, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25833888351917267, "step": 5420 }, { "epoch": 0.10844, "grad_norm": 2.21875, "grad_norm_var": 0.017723592122395833, "learning_rate": 0.0001, "loss": 4.4535, "loss/crossentropy": 2.1932299733161926, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26209479570388794, "step": 5422 }, { "epoch": 0.10848, "grad_norm": 2.390625, "grad_norm_var": 0.016185506184895834, "learning_rate": 0.0001, "loss": 4.7057, "loss/crossentropy": 2.3909924030303955, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2525275945663452, "step": 5424 }, { "epoch": 0.10852, "grad_norm": 2.296875, "grad_norm_var": 0.011324055989583333, "learning_rate": 0.0001, "loss": 4.7509, "loss/crossentropy": 2.423817992210388, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2635423541069031, "step": 5426 }, { "epoch": 0.10856, "grad_norm": 2.34375, "grad_norm_var": 0.009227498372395834, "learning_rate": 0.0001, "loss": 4.7082, "loss/crossentropy": 1.9641632437705994, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2460889369249344, "step": 5428 }, { "epoch": 0.1086, "grad_norm": 2.34375, "grad_norm_var": 0.0077707926432291664, "learning_rate": 0.0001, "loss": 4.6347, "loss/crossentropy": 2.027769148349762, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2357894703745842, "step": 5430 }, { "epoch": 0.10864, "grad_norm": 2.53125, "grad_norm_var": 0.0128570556640625, "learning_rate": 0.0001, "loss": 4.4833, "loss/crossentropy": 2.122319996356964, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2445123866200447, "step": 5432 }, { "epoch": 0.10868, "grad_norm": 3.53125, "grad_norm_var": 0.0993072509765625, "learning_rate": 0.0001, "loss": 4.6332, "loss/crossentropy": 1.8631052374839783, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23456327617168427, "step": 5434 }, { "epoch": 0.10872, "grad_norm": 2.59375, "grad_norm_var": 0.1000885009765625, "learning_rate": 0.0001, "loss": 4.6022, "loss/crossentropy": 2.184281885623932, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24073782563209534, "step": 5436 }, { "epoch": 0.10876, "grad_norm": 2.453125, "grad_norm_var": 0.09521077473958334, "learning_rate": 0.0001, "loss": 4.7912, "loss/crossentropy": 1.9587833881378174, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22247321158647537, "step": 5438 }, { "epoch": 0.1088, "grad_norm": 2.296875, "grad_norm_var": 0.09562886555989583, "learning_rate": 0.0001, "loss": 4.5185, "loss/crossentropy": 2.334655284881592, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24817728251218796, "step": 5440 }, { "epoch": 0.10884, "grad_norm": 2.109375, "grad_norm_var": 0.10161031087239583, "learning_rate": 0.0001, "loss": 4.3817, "loss/crossentropy": 2.1424371004104614, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2622353136539459, "step": 5442 }, { "epoch": 0.10888, "grad_norm": 2.203125, "grad_norm_var": 0.10598958333333333, "learning_rate": 0.0001, "loss": 4.5876, "loss/crossentropy": 2.0363662242889404, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2450125440955162, "step": 5444 }, { "epoch": 0.10892, "grad_norm": 2.1875, "grad_norm_var": 0.110791015625, "learning_rate": 0.0001, "loss": 4.4015, "loss/crossentropy": 2.0536006689071655, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23283181339502335, "step": 5446 }, { "epoch": 0.10896, "grad_norm": 2.234375, "grad_norm_var": 0.10741780598958334, "learning_rate": 0.0001, "loss": 4.5194, "loss/crossentropy": 2.2678059339523315, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24513862282037735, "step": 5448 }, { "epoch": 0.109, "grad_norm": 2.203125, "grad_norm_var": 0.02115478515625, "learning_rate": 0.0001, "loss": 4.7404, "loss/crossentropy": 2.406686782836914, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2901146113872528, "step": 5450 }, { "epoch": 0.10904, "grad_norm": 2.296875, "grad_norm_var": 0.017801920572916668, "learning_rate": 0.0001, "loss": 4.4724, "loss/crossentropy": 2.352605938911438, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25215400755405426, "step": 5452 }, { "epoch": 0.10908, "grad_norm": 2.203125, "grad_norm_var": 0.014546712239583334, "learning_rate": 0.0001, "loss": 4.5593, "loss/crossentropy": 1.9139850735664368, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22682765871286392, "step": 5454 }, { "epoch": 0.10912, "grad_norm": 2.171875, "grad_norm_var": 0.01441650390625, "learning_rate": 0.0001, "loss": 4.6163, "loss/crossentropy": 2.3240445852279663, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24574412405490875, "step": 5456 }, { "epoch": 0.10916, "grad_norm": 2.21875, "grad_norm_var": 0.0123687744140625, "learning_rate": 0.0001, "loss": 4.3356, "loss/crossentropy": 1.9347040057182312, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2159302830696106, "step": 5458 }, { "epoch": 0.1092, "grad_norm": 2.21875, "grad_norm_var": 0.012360636393229167, "learning_rate": 0.0001, "loss": 4.6539, "loss/crossentropy": 1.8933109641075134, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.284725621342659, "step": 5460 }, { "epoch": 0.10924, "grad_norm": 2.15625, "grad_norm_var": 0.0129058837890625, "learning_rate": 0.0001, "loss": 4.2488, "loss/crossentropy": 2.3611297607421875, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2651517689228058, "step": 5462 }, { "epoch": 0.10928, "grad_norm": 2.140625, "grad_norm_var": 0.0235504150390625, "learning_rate": 0.0001, "loss": 4.4564, "loss/crossentropy": 1.828608751296997, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2337760180234909, "step": 5464 }, { "epoch": 0.10932, "grad_norm": 2.140625, "grad_norm_var": 0.015412394205729167, "learning_rate": 0.0001, "loss": 4.321, "loss/crossentropy": 2.1374374628067017, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2525453567504883, "step": 5466 }, { "epoch": 0.10936, "grad_norm": 2.125, "grad_norm_var": 0.0150543212890625, "learning_rate": 0.0001, "loss": 4.5307, "loss/crossentropy": 1.8054441213607788, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2252344712615013, "step": 5468 }, { "epoch": 0.1094, "grad_norm": 2.421875, "grad_norm_var": 0.022972615559895833, "learning_rate": 0.0001, "loss": 4.616, "loss/crossentropy": 2.1468498706817627, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2604861631989479, "step": 5470 }, { "epoch": 0.10944, "grad_norm": 2.359375, "grad_norm_var": 0.022684733072916668, "learning_rate": 0.0001, "loss": 4.7298, "loss/crossentropy": 2.2180548906326294, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2603686898946762, "step": 5472 }, { "epoch": 0.10948, "grad_norm": 2.265625, "grad_norm_var": 0.022261555989583334, "learning_rate": 0.0001, "loss": 4.5263, "loss/crossentropy": 1.9773708581924438, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23676057159900665, "step": 5474 }, { "epoch": 0.10952, "grad_norm": 2.40625, "grad_norm_var": 0.023485310872395835, "learning_rate": 0.0001, "loss": 4.6156, "loss/crossentropy": 1.9277283549308777, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22822558879852295, "step": 5476 }, { "epoch": 0.10956, "grad_norm": 2.34375, "grad_norm_var": 0.022484334309895833, "learning_rate": 0.0001, "loss": 4.5529, "loss/crossentropy": 2.0625431537628174, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2565220817923546, "step": 5478 }, { "epoch": 0.1096, "grad_norm": 2.25, "grad_norm_var": 0.01441650390625, "learning_rate": 0.0001, "loss": 4.7956, "loss/crossentropy": 2.383894443511963, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2563806623220444, "step": 5480 }, { "epoch": 0.10964, "grad_norm": 2.171875, "grad_norm_var": 0.0120758056640625, "learning_rate": 0.0001, "loss": 4.5226, "loss/crossentropy": 2.409442663192749, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2637571170926094, "step": 5482 }, { "epoch": 0.10968, "grad_norm": 2.203125, "grad_norm_var": 0.0151763916015625, "learning_rate": 0.0001, "loss": 4.743, "loss/crossentropy": 2.1789854764938354, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23846548050642014, "step": 5484 }, { "epoch": 0.10972, "grad_norm": 2.265625, "grad_norm_var": 0.011839803059895833, "learning_rate": 0.0001, "loss": 4.4108, "loss/crossentropy": 2.127842903137207, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2622772455215454, "step": 5486 }, { "epoch": 0.10976, "grad_norm": 2.1875, "grad_norm_var": 0.01412353515625, "learning_rate": 0.0001, "loss": 4.6032, "loss/crossentropy": 2.107556462287903, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24714312702417374, "step": 5488 }, { "epoch": 0.1098, "grad_norm": 2.578125, "grad_norm_var": 0.022459920247395834, "learning_rate": 0.0001, "loss": 4.6525, "loss/crossentropy": 2.1959601640701294, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23568396270275116, "step": 5490 }, { "epoch": 0.10984, "grad_norm": 2.171875, "grad_norm_var": 0.021826171875, "learning_rate": 0.0001, "loss": 4.5584, "loss/crossentropy": 2.1246761083602905, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24083472788333893, "step": 5492 }, { "epoch": 0.10988, "grad_norm": 2.15625, "grad_norm_var": 0.025031534830729167, "learning_rate": 0.0001, "loss": 4.6436, "loss/crossentropy": 2.091724157333374, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2527346611022949, "step": 5494 }, { "epoch": 0.10992, "grad_norm": 2.109375, "grad_norm_var": 0.026056925455729168, "learning_rate": 0.0001, "loss": 4.3207, "loss/crossentropy": 1.8898470997810364, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20916878432035446, "step": 5496 }, { "epoch": 0.10996, "grad_norm": 2.21875, "grad_norm_var": 0.025951131184895834, "learning_rate": 0.0001, "loss": 4.399, "loss/crossentropy": 2.1901716589927673, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2409309297800064, "step": 5498 }, { "epoch": 0.11, "grad_norm": 2.234375, "grad_norm_var": 0.017626953125, "learning_rate": 0.0001, "loss": 4.6897, "loss/crossentropy": 2.1018574237823486, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24559657275676727, "step": 5500 }, { "epoch": 0.11004, "grad_norm": 2.1875, "grad_norm_var": 0.019449869791666668, "learning_rate": 0.0001, "loss": 3.8159, "loss/crossentropy": 2.0575350522994995, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23721858859062195, "step": 5502 }, { "epoch": 0.11008, "grad_norm": 2.15625, "grad_norm_var": 0.019710286458333334, "learning_rate": 0.0001, "loss": 4.6347, "loss/crossentropy": 2.1846336126327515, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2378462702035904, "step": 5504 }, { "epoch": 0.11012, "grad_norm": 2.4375, "grad_norm_var": 0.014574178059895833, "learning_rate": 0.0001, "loss": 4.4028, "loss/crossentropy": 2.1359363198280334, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26211391389369965, "step": 5506 }, { "epoch": 0.11016, "grad_norm": 2.265625, "grad_norm_var": 0.01416015625, "learning_rate": 0.0001, "loss": 4.5621, "loss/crossentropy": 2.236825942993164, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2557792589068413, "step": 5508 }, { "epoch": 0.1102, "grad_norm": 2.25, "grad_norm_var": 0.009521484375, "learning_rate": 0.0001, "loss": 4.3234, "loss/crossentropy": 2.3140580654144287, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2554958164691925, "step": 5510 }, { "epoch": 0.11024, "grad_norm": 2.140625, "grad_norm_var": 0.00904541015625, "learning_rate": 0.0001, "loss": 4.4382, "loss/crossentropy": 1.7190355062484741, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19964009523391724, "step": 5512 }, { "epoch": 0.11028, "grad_norm": 2.203125, "grad_norm_var": 0.014631144205729167, "learning_rate": 0.0001, "loss": 4.3831, "loss/crossentropy": 1.8326427340507507, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2199154868721962, "step": 5514 }, { "epoch": 0.11032, "grad_norm": 2.109375, "grad_norm_var": 0.01549072265625, "learning_rate": 0.0001, "loss": 4.4494, "loss/crossentropy": 1.9013121724128723, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20293578505516052, "step": 5516 }, { "epoch": 0.11036, "grad_norm": 2.1875, "grad_norm_var": 0.0138671875, "learning_rate": 0.0001, "loss": 4.7056, "loss/crossentropy": 2.0221983790397644, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23435519635677338, "step": 5518 }, { "epoch": 0.1104, "grad_norm": 2.359375, "grad_norm_var": 0.0140533447265625, "learning_rate": 0.0001, "loss": 4.6157, "loss/crossentropy": 2.153970956802368, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22127598524093628, "step": 5520 }, { "epoch": 0.11044, "grad_norm": 2.125, "grad_norm_var": 0.010835774739583333, "learning_rate": 0.0001, "loss": 4.4516, "loss/crossentropy": 1.8674496412277222, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.232261061668396, "step": 5522 }, { "epoch": 0.11048, "grad_norm": 2.078125, "grad_norm_var": 0.012495930989583333, "learning_rate": 0.0001, "loss": 4.6528, "loss/crossentropy": 2.1575759649276733, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22218701988458633, "step": 5524 }, { "epoch": 0.11052, "grad_norm": 2.25, "grad_norm_var": 0.012434895833333333, "learning_rate": 0.0001, "loss": 4.5553, "loss/crossentropy": 2.054452419281006, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25137215852737427, "step": 5526 }, { "epoch": 0.11056, "grad_norm": 2.296875, "grad_norm_var": 0.022639973958333334, "learning_rate": 0.0001, "loss": 4.4789, "loss/crossentropy": 1.966478705406189, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23289234936237335, "step": 5528 }, { "epoch": 0.1106, "grad_norm": 2.234375, "grad_norm_var": 0.017626953125, "learning_rate": 0.0001, "loss": 4.687, "loss/crossentropy": 2.171034336090088, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23373593389987946, "step": 5530 }, { "epoch": 0.11064, "grad_norm": 1.953125, "grad_norm_var": 0.022362263997395833, "learning_rate": 0.0001, "loss": 4.142, "loss/crossentropy": 2.2416292428970337, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24835532158613205, "step": 5532 }, { "epoch": 0.11068, "grad_norm": 2.53125, "grad_norm_var": 0.02720947265625, "learning_rate": 0.0001, "loss": 4.6252, "loss/crossentropy": 2.2599780559539795, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24948878586292267, "step": 5534 }, { "epoch": 0.11072, "grad_norm": 2.34375, "grad_norm_var": 0.028902180989583335, "learning_rate": 0.0001, "loss": 4.6984, "loss/crossentropy": 2.2292014360427856, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26616473495960236, "step": 5536 }, { "epoch": 0.11076, "grad_norm": 2.375, "grad_norm_var": 0.027424112955729166, "learning_rate": 0.0001, "loss": 4.336, "loss/crossentropy": 1.9285388588905334, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22964124381542206, "step": 5538 }, { "epoch": 0.1108, "grad_norm": 2.171875, "grad_norm_var": 0.0251953125, "learning_rate": 0.0001, "loss": 4.2861, "loss/crossentropy": 1.864789366722107, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2264304906129837, "step": 5540 }, { "epoch": 0.11084, "grad_norm": 2.234375, "grad_norm_var": 0.0250885009765625, "learning_rate": 0.0001, "loss": 4.5163, "loss/crossentropy": 1.8676912188529968, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21919699758291245, "step": 5542 }, { "epoch": 0.11088, "grad_norm": 2.234375, "grad_norm_var": 0.0190338134765625, "learning_rate": 0.0001, "loss": 4.3612, "loss/crossentropy": 2.34523469209671, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25638002157211304, "step": 5544 }, { "epoch": 0.11092, "grad_norm": 2.09375, "grad_norm_var": 0.020992024739583334, "learning_rate": 0.0001, "loss": 4.5165, "loss/crossentropy": 2.2903120517730713, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25111711025238037, "step": 5546 }, { "epoch": 0.11096, "grad_norm": 2.15625, "grad_norm_var": 0.017671712239583335, "learning_rate": 0.0001, "loss": 4.5336, "loss/crossentropy": 2.2106658220291138, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22340595722198486, "step": 5548 }, { "epoch": 0.111, "grad_norm": 2.203125, "grad_norm_var": 0.013700358072916667, "learning_rate": 0.0001, "loss": 4.6305, "loss/crossentropy": 2.0777581334114075, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2446284517645836, "step": 5550 }, { "epoch": 0.11104, "grad_norm": 2.234375, "grad_norm_var": 0.010758463541666667, "learning_rate": 0.0001, "loss": 4.5507, "loss/crossentropy": 2.131237506866455, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24113191664218903, "step": 5552 }, { "epoch": 0.11108, "grad_norm": 2.578125, "grad_norm_var": 0.018778483072916668, "learning_rate": 0.0001, "loss": 4.6337, "loss/crossentropy": 2.190987467765808, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2528213635087013, "step": 5554 }, { "epoch": 0.11112, "grad_norm": 2.25, "grad_norm_var": 0.019527180989583334, "learning_rate": 0.0001, "loss": 4.4889, "loss/crossentropy": 2.26843523979187, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.27781064808368683, "step": 5556 }, { "epoch": 0.11116, "grad_norm": 2.25, "grad_norm_var": 0.019527180989583334, "learning_rate": 0.0001, "loss": 4.2242, "loss/crossentropy": 1.9507999420166016, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23172564804553986, "step": 5558 }, { "epoch": 0.1112, "grad_norm": 2.28125, "grad_norm_var": 0.016966756184895834, "learning_rate": 0.0001, "loss": 4.4821, "loss/crossentropy": 2.0738234519958496, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24497026205062866, "step": 5560 }, { "epoch": 0.11124, "grad_norm": 2.515625, "grad_norm_var": 0.019261678059895832, "learning_rate": 0.0001, "loss": 4.9501, "loss/crossentropy": 2.273179054260254, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2917838394641876, "step": 5562 }, { "epoch": 0.11128, "grad_norm": 2.359375, "grad_norm_var": 0.017899576822916666, "learning_rate": 0.0001, "loss": 4.774, "loss/crossentropy": 2.085157036781311, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2488701120018959, "step": 5564 }, { "epoch": 0.11132, "grad_norm": 3.109375, "grad_norm_var": 0.05950113932291667, "learning_rate": 0.0001, "loss": 4.2869, "loss/crossentropy": 2.0528116822242737, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24717991054058075, "step": 5566 }, { "epoch": 0.11136, "grad_norm": 7.0, "grad_norm_var": 1.3981597900390625, "learning_rate": 0.0001, "loss": 4.4443, "loss/crossentropy": 2.0651500821113586, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26806148886680603, "step": 5568 }, { "epoch": 0.1114, "grad_norm": 2.328125, "grad_norm_var": 1.4721588134765624, "learning_rate": 0.0001, "loss": 4.6162, "loss/crossentropy": 2.1860616207122803, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23045460134744644, "step": 5570 }, { "epoch": 0.11144, "grad_norm": 2.328125, "grad_norm_var": 1.460399373372396, "learning_rate": 0.0001, "loss": 4.3629, "loss/crossentropy": 1.6931262016296387, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20816385000944138, "step": 5572 }, { "epoch": 0.11148, "grad_norm": 2.28125, "grad_norm_var": 1.4581858317057292, "learning_rate": 0.0001, "loss": 4.3376, "loss/crossentropy": 2.199341118335724, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2490597665309906, "step": 5574 }, { "epoch": 0.11152, "grad_norm": 2.484375, "grad_norm_var": 1.4561513264973958, "learning_rate": 0.0001, "loss": 4.6627, "loss/crossentropy": 2.2010069489479065, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24095547199249268, "step": 5576 }, { "epoch": 0.11156, "grad_norm": 2.234375, "grad_norm_var": 1.4810129801432292, "learning_rate": 0.0001, "loss": 4.5243, "loss/crossentropy": 1.9907150864601135, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2122703418135643, "step": 5578 }, { "epoch": 0.1116, "grad_norm": 2.21875, "grad_norm_var": 1.5023508707682292, "learning_rate": 0.0001, "loss": 4.2618, "loss/crossentropy": 2.196335554122925, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25111766904592514, "step": 5580 }, { "epoch": 0.11164, "grad_norm": 2.171875, "grad_norm_var": 1.5003000895182292, "learning_rate": 0.0001, "loss": 4.5104, "loss/crossentropy": 2.0762988924980164, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23773670941591263, "step": 5582 }, { "epoch": 0.11168, "grad_norm": 2.140625, "grad_norm_var": 0.15530192057291667, "learning_rate": 0.0001, "loss": 4.3166, "loss/crossentropy": 2.0803143978118896, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22162869572639465, "step": 5584 }, { "epoch": 0.11172, "grad_norm": 2.3125, "grad_norm_var": 0.0084136962890625, "learning_rate": 0.0001, "loss": 4.5596, "loss/crossentropy": 2.1821994185447693, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26280316710472107, "step": 5586 }, { "epoch": 0.11176, "grad_norm": 2.171875, "grad_norm_var": 0.011847941080729167, "learning_rate": 0.0001, "loss": 4.4328, "loss/crossentropy": 2.1899439096450806, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24677179753780365, "step": 5588 }, { "epoch": 0.1118, "grad_norm": 2.21875, "grad_norm_var": 0.0116851806640625, "learning_rate": 0.0001, "loss": 4.4361, "loss/crossentropy": 2.334734559059143, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2481135129928589, "step": 5590 }, { "epoch": 0.11184, "grad_norm": 2.34375, "grad_norm_var": 0.007291666666666667, "learning_rate": 0.0001, "loss": 4.7838, "loss/crossentropy": 2.2976341247558594, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2454354390501976, "step": 5592 }, { "epoch": 0.11188, "grad_norm": 2.3125, "grad_norm_var": 0.007477823893229167, "learning_rate": 0.0001, "loss": 4.7148, "loss/crossentropy": 2.3243749141693115, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2735731601715088, "step": 5594 }, { "epoch": 0.11192, "grad_norm": 2.28125, "grad_norm_var": 0.007372029622395833, "learning_rate": 0.0001, "loss": 4.7124, "loss/crossentropy": 2.0328271985054016, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21919244527816772, "step": 5596 }, { "epoch": 0.11196, "grad_norm": 2.234375, "grad_norm_var": 0.0077789306640625, "learning_rate": 0.0001, "loss": 4.4584, "loss/crossentropy": 2.249367594718933, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2577860951423645, "step": 5598 }, { "epoch": 0.112, "grad_norm": 2.046875, "grad_norm_var": 0.009505208333333333, "learning_rate": 0.0001, "loss": 4.2594, "loss/crossentropy": 1.9844761490821838, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2142435386776924, "step": 5600 }, { "epoch": 0.11204, "grad_norm": 2.375, "grad_norm_var": 0.0111328125, "learning_rate": 0.0001, "loss": 4.6043, "loss/crossentropy": 2.1334372758865356, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23296397179365158, "step": 5602 }, { "epoch": 0.11208, "grad_norm": 2.3125, "grad_norm_var": 0.0078084309895833336, "learning_rate": 0.0001, "loss": 4.5308, "loss/crossentropy": 2.119946002960205, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23398507386446, "step": 5604 }, { "epoch": 0.11212, "grad_norm": 2.25, "grad_norm_var": 0.008040364583333333, "learning_rate": 0.0001, "loss": 4.5011, "loss/crossentropy": 2.0414544343948364, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23420867323875427, "step": 5606 }, { "epoch": 0.11216, "grad_norm": 2.125, "grad_norm_var": 0.009130859375, "learning_rate": 0.0001, "loss": 4.2092, "loss/crossentropy": 1.9592725038528442, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23922864347696304, "step": 5608 }, { "epoch": 0.1122, "grad_norm": 2.4375, "grad_norm_var": 0.011188761393229166, "learning_rate": 0.0001, "loss": 4.4795, "loss/crossentropy": 2.150269627571106, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2528265416622162, "step": 5610 }, { "epoch": 0.11224, "grad_norm": 2.578125, "grad_norm_var": 0.3099772135416667, "learning_rate": 0.0001, "loss": 4.6234, "loss/crossentropy": 2.0591378211975098, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2430240362882614, "step": 5612 }, { "epoch": 0.11228, "grad_norm": 2.171875, "grad_norm_var": 0.30794169108072916, "learning_rate": 0.0001, "loss": 4.4251, "loss/crossentropy": 2.2132861614227295, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22676552087068558, "step": 5614 }, { "epoch": 0.11232, "grad_norm": 2.375, "grad_norm_var": 0.3002237955729167, "learning_rate": 0.0001, "loss": 4.4332, "loss/crossentropy": 1.9607917070388794, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23068836331367493, "step": 5616 }, { "epoch": 0.11236, "grad_norm": 2.25, "grad_norm_var": 0.3021321614583333, "learning_rate": 0.0001, "loss": 4.549, "loss/crossentropy": 2.2245940566062927, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25029121339321136, "step": 5618 }, { "epoch": 0.1124, "grad_norm": 2.234375, "grad_norm_var": 0.30278218587239586, "learning_rate": 0.0001, "loss": 4.4539, "loss/crossentropy": 2.2511253356933594, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25143079459667206, "step": 5620 }, { "epoch": 0.11244, "grad_norm": 2.328125, "grad_norm_var": 0.3026194254557292, "learning_rate": 0.0001, "loss": 4.4632, "loss/crossentropy": 2.2945470809936523, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2196483463048935, "step": 5622 }, { "epoch": 0.11248, "grad_norm": 2.359375, "grad_norm_var": 0.29273681640625, "learning_rate": 0.0001, "loss": 4.8769, "loss/crossentropy": 2.2266393899917603, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24020668864250183, "step": 5624 }, { "epoch": 0.11252, "grad_norm": 2.359375, "grad_norm_var": 0.29136454264322914, "learning_rate": 0.0001, "loss": 4.742, "loss/crossentropy": 2.2835845947265625, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.27199871838092804, "step": 5626 }, { "epoch": 0.11256, "grad_norm": 2.859375, "grad_norm_var": 0.026383463541666666, "learning_rate": 0.0001, "loss": 4.4213, "loss/crossentropy": 1.9576718211174011, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21716968715190887, "step": 5628 }, { "epoch": 0.1126, "grad_norm": 2.375, "grad_norm_var": 0.044611612955729164, "learning_rate": 0.0001, "loss": 4.7695, "loss/crossentropy": 2.0955676436424255, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24259109795093536, "step": 5630 }, { "epoch": 0.11264, "grad_norm": 2.5625, "grad_norm_var": 0.044840494791666664, "learning_rate": 0.0001, "loss": 4.4127, "loss/crossentropy": 2.119523346424103, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2478165179491043, "step": 5632 }, { "epoch": 0.11268, "grad_norm": 2.265625, "grad_norm_var": 0.0444488525390625, "learning_rate": 0.0001, "loss": 4.5591, "loss/crossentropy": 2.189425826072693, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24532486498355865, "step": 5634 }, { "epoch": 0.11272, "grad_norm": 2.390625, "grad_norm_var": 0.0467193603515625, "learning_rate": 0.0001, "loss": 4.4969, "loss/crossentropy": 2.215874433517456, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24533041566610336, "step": 5636 }, { "epoch": 0.11276, "grad_norm": 2.515625, "grad_norm_var": 0.04868876139322917, "learning_rate": 0.0001, "loss": 4.5657, "loss/crossentropy": 2.226451873779297, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2545652836561203, "step": 5638 }, { "epoch": 0.1128, "grad_norm": 2.296875, "grad_norm_var": 0.05181884765625, "learning_rate": 0.0001, "loss": 4.4779, "loss/crossentropy": 2.0343592762947083, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2503928989171982, "step": 5640 }, { "epoch": 0.11284, "grad_norm": 2.25, "grad_norm_var": 0.0535552978515625, "learning_rate": 0.0001, "loss": 4.6253, "loss/crossentropy": 2.142001748085022, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24624411761760712, "step": 5642 }, { "epoch": 0.11288, "grad_norm": 2.140625, "grad_norm_var": 0.039383951822916666, "learning_rate": 0.0001, "loss": 4.4212, "loss/crossentropy": 1.822394609451294, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21315739303827286, "step": 5644 }, { "epoch": 0.11292, "grad_norm": 2.140625, "grad_norm_var": 0.018863932291666666, "learning_rate": 0.0001, "loss": 4.1611, "loss/crossentropy": 2.3221731185913086, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24407917261123657, "step": 5646 }, { "epoch": 0.11296, "grad_norm": 2.234375, "grad_norm_var": 0.012531534830729166, "learning_rate": 0.0001, "loss": 4.5591, "loss/crossentropy": 1.9784467816352844, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22578515857458115, "step": 5648 }, { "epoch": 0.113, "grad_norm": 2.328125, "grad_norm_var": 0.0142730712890625, "learning_rate": 0.0001, "loss": 4.636, "loss/crossentropy": 2.2148635387420654, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25031210482120514, "step": 5650 }, { "epoch": 0.11304, "grad_norm": 2.21875, "grad_norm_var": 0.012369791666666666, "learning_rate": 0.0001, "loss": 4.4644, "loss/crossentropy": 1.9204095602035522, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22073335200548172, "step": 5652 }, { "epoch": 0.11308, "grad_norm": 2.421875, "grad_norm_var": 0.009764607747395833, "learning_rate": 0.0001, "loss": 4.6601, "loss/crossentropy": 2.092605173587799, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24161123484373093, "step": 5654 }, { "epoch": 0.11312, "grad_norm": 2.546875, "grad_norm_var": 0.0163726806640625, "learning_rate": 0.0001, "loss": 4.5977, "loss/crossentropy": 1.9546263217926025, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22741927206516266, "step": 5656 }, { "epoch": 0.11316, "grad_norm": 2.28125, "grad_norm_var": 0.016974894205729167, "learning_rate": 0.0001, "loss": 4.6585, "loss/crossentropy": 2.256605863571167, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2560018301010132, "step": 5658 }, { "epoch": 0.1132, "grad_norm": 2.21875, "grad_norm_var": 0.0157135009765625, "learning_rate": 0.0001, "loss": 4.3605, "loss/crossentropy": 2.24527370929718, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.270521879196167, "step": 5660 }, { "epoch": 0.11324, "grad_norm": 2.5, "grad_norm_var": 0.014029947916666667, "learning_rate": 0.0001, "loss": 4.5429, "loss/crossentropy": 1.8154722452163696, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22786997258663177, "step": 5662 }, { "epoch": 0.11328, "grad_norm": 2.359375, "grad_norm_var": 0.0143951416015625, "learning_rate": 0.0001, "loss": 4.4079, "loss/crossentropy": 2.135699689388275, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.253468282520771, "step": 5664 }, { "epoch": 0.11332, "grad_norm": 2.203125, "grad_norm_var": 0.015087890625, "learning_rate": 0.0001, "loss": 4.5381, "loss/crossentropy": 2.15896338224411, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25651170313358307, "step": 5666 }, { "epoch": 0.11336, "grad_norm": 2.40625, "grad_norm_var": 0.012035115559895834, "learning_rate": 0.0001, "loss": 4.7877, "loss/crossentropy": 2.115864336490631, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.28546491265296936, "step": 5668 }, { "epoch": 0.1134, "grad_norm": 2.25, "grad_norm_var": 0.012214152018229167, "learning_rate": 0.0001, "loss": 4.4283, "loss/crossentropy": 2.2036256790161133, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2397892326116562, "step": 5670 }, { "epoch": 0.11344, "grad_norm": 2.234375, "grad_norm_var": 0.008430989583333333, "learning_rate": 0.0001, "loss": 4.598, "loss/crossentropy": 2.4966647624969482, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2984588146209717, "step": 5672 }, { "epoch": 0.11348, "grad_norm": 2.234375, "grad_norm_var": 0.008202107747395833, "learning_rate": 0.0001, "loss": 4.5289, "loss/crossentropy": 2.051860749721527, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2748461365699768, "step": 5674 }, { "epoch": 0.11352, "grad_norm": 2.328125, "grad_norm_var": 0.007938639322916666, "learning_rate": 0.0001, "loss": 4.597, "loss/crossentropy": 2.046416461467743, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2320190668106079, "step": 5676 }, { "epoch": 0.11356, "grad_norm": 2.21875, "grad_norm_var": 0.0054514567057291664, "learning_rate": 0.0001, "loss": 4.7389, "loss/crossentropy": 2.2385342121124268, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21859309077262878, "step": 5678 }, { "epoch": 0.1136, "grad_norm": 2.265625, "grad_norm_var": 0.006843058268229166, "learning_rate": 0.0001, "loss": 4.2487, "loss/crossentropy": 1.8511550426483154, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22065181285142899, "step": 5680 }, { "epoch": 0.11364, "grad_norm": 2.828125, "grad_norm_var": 0.031477864583333334, "learning_rate": 0.0001, "loss": 4.642, "loss/crossentropy": 2.304056167602539, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.29507844150066376, "step": 5682 }, { "epoch": 0.11368, "grad_norm": 2.4375, "grad_norm_var": 0.03329671223958333, "learning_rate": 0.0001, "loss": 4.5519, "loss/crossentropy": 1.993275225162506, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.33826952427625656, "step": 5684 }, { "epoch": 0.11372, "grad_norm": 2.140625, "grad_norm_var": 0.03498433430989583, "learning_rate": 0.0001, "loss": 4.4729, "loss/crossentropy": 2.1836347579956055, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24539872258901596, "step": 5686 }, { "epoch": 0.11376, "grad_norm": 2.296875, "grad_norm_var": 0.03504231770833333, "learning_rate": 0.0001, "loss": 4.5908, "loss/crossentropy": 1.9467885494232178, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2180013582110405, "step": 5688 }, { "epoch": 0.1138, "grad_norm": 2.78125, "grad_norm_var": 0.04907938639322917, "learning_rate": 0.0001, "loss": 4.2888, "loss/crossentropy": 1.9907563924789429, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2551623433828354, "step": 5690 }, { "epoch": 0.11384, "grad_norm": 2.21875, "grad_norm_var": 0.049153645833333336, "learning_rate": 0.0001, "loss": 4.2186, "loss/crossentropy": 1.9452654719352722, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24263620376586914, "step": 5692 }, { "epoch": 0.11388, "grad_norm": 2.171875, "grad_norm_var": 0.050093587239583334, "learning_rate": 0.0001, "loss": 4.5816, "loss/crossentropy": 2.2448811531066895, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2508034110069275, "step": 5694 }, { "epoch": 0.11392, "grad_norm": 2.65625, "grad_norm_var": 0.055712890625, "learning_rate": 0.0001, "loss": 4.1868, "loss/crossentropy": 1.991935908794403, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22709138691425323, "step": 5696 }, { "epoch": 0.11396, "grad_norm": 2.625, "grad_norm_var": 0.47700907389322916, "learning_rate": 0.0001, "loss": 4.8208, "loss/crossentropy": 2.0479623675346375, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23980189859867096, "step": 5698 }, { "epoch": 0.114, "grad_norm": 2.15625, "grad_norm_var": 0.4779581705729167, "learning_rate": 0.0001, "loss": 4.4742, "loss/crossentropy": 1.9319151639938354, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2295292615890503, "step": 5700 }, { "epoch": 0.11404, "grad_norm": 2.296875, "grad_norm_var": 0.47330322265625, "learning_rate": 0.0001, "loss": 4.32, "loss/crossentropy": 1.985447645187378, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.246543288230896, "step": 5702 }, { "epoch": 0.11408, "grad_norm": 2.84375, "grad_norm_var": 0.478271484375, "learning_rate": 0.0001, "loss": 4.6602, "loss/crossentropy": 2.0016521215438843, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25966253876686096, "step": 5704 }, { "epoch": 0.11412, "grad_norm": 2.421875, "grad_norm_var": 0.47038472493489586, "learning_rate": 0.0001, "loss": 4.9207, "loss/crossentropy": 2.112374246120453, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3251144737005234, "step": 5706 }, { "epoch": 0.11416, "grad_norm": 2.328125, "grad_norm_var": 0.4698720296223958, "learning_rate": 0.0001, "loss": 4.5223, "loss/crossentropy": 2.0931158661842346, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2275928407907486, "step": 5708 }, { "epoch": 0.1142, "grad_norm": 2.328125, "grad_norm_var": 0.46104227701822914, "learning_rate": 0.0001, "loss": 4.4481, "loss/crossentropy": 1.9867743849754333, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2419440597295761, "step": 5710 }, { "epoch": 0.11424, "grad_norm": 2.171875, "grad_norm_var": 0.4639312744140625, "learning_rate": 0.0001, "loss": 4.3874, "loss/crossentropy": 2.1822216510772705, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2437271624803543, "step": 5712 }, { "epoch": 0.11428, "grad_norm": 6.46875, "grad_norm_var": 1.1088775634765624, "learning_rate": 0.0001, "loss": 4.5307, "loss/crossentropy": 2.412580370903015, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3222763389348984, "step": 5714 }, { "epoch": 0.11432, "grad_norm": 2.28125, "grad_norm_var": 1.1019490559895833, "learning_rate": 0.0001, "loss": 4.2491, "loss/crossentropy": 1.676392376422882, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24499019235372543, "step": 5716 }, { "epoch": 0.11436, "grad_norm": 2.328125, "grad_norm_var": 1.1039876302083333, "learning_rate": 0.0001, "loss": 4.4741, "loss/crossentropy": 1.7818856835365295, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22935030609369278, "step": 5718 }, { "epoch": 0.1144, "grad_norm": 2.40625, "grad_norm_var": 1.0944732666015624, "learning_rate": 0.0001, "loss": 4.5445, "loss/crossentropy": 2.012014925479889, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.231942281126976, "step": 5720 }, { "epoch": 0.11444, "grad_norm": 2.296875, "grad_norm_var": 1.1023834228515625, "learning_rate": 0.0001, "loss": 4.5061, "loss/crossentropy": 2.3663965463638306, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.254768967628479, "step": 5722 }, { "epoch": 0.11448, "grad_norm": 2.34375, "grad_norm_var": 1.107982381184896, "learning_rate": 0.0001, "loss": 4.5878, "loss/crossentropy": 2.343206286430359, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26609115302562714, "step": 5724 }, { "epoch": 0.11452, "grad_norm": 2.21875, "grad_norm_var": 1.1131337483723958, "learning_rate": 0.0001, "loss": 4.5047, "loss/crossentropy": 1.8696978092193604, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24282050132751465, "step": 5726 }, { "epoch": 0.11456, "grad_norm": 2.0625, "grad_norm_var": 1.1170644124348958, "learning_rate": 0.0001, "loss": 4.2198, "loss/crossentropy": 2.1430450677871704, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23211582750082016, "step": 5728 }, { "epoch": 0.1146, "grad_norm": 2.421875, "grad_norm_var": 0.013841756184895833, "learning_rate": 0.0001, "loss": 4.6868, "loss/crossentropy": 2.2231308221817017, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23736387491226196, "step": 5730 }, { "epoch": 0.11464, "grad_norm": 2.421875, "grad_norm_var": 0.014069620768229167, "learning_rate": 0.0001, "loss": 4.4911, "loss/crossentropy": 1.8789280652999878, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24081497639417648, "step": 5732 }, { "epoch": 0.11468, "grad_norm": 2.1875, "grad_norm_var": 0.014875284830729167, "learning_rate": 0.0001, "loss": 4.4478, "loss/crossentropy": 2.0491732358932495, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20530463755130768, "step": 5734 }, { "epoch": 0.11472, "grad_norm": 2.296875, "grad_norm_var": 0.012516276041666666, "learning_rate": 0.0001, "loss": 4.4352, "loss/crossentropy": 2.067046642303467, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24701344966888428, "step": 5736 }, { "epoch": 0.11476, "grad_norm": 2.234375, "grad_norm_var": 0.012791951497395834, "learning_rate": 0.0001, "loss": 4.4515, "loss/crossentropy": 2.0207647681236267, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2243770807981491, "step": 5738 }, { "epoch": 0.1148, "grad_norm": 2.21875, "grad_norm_var": 0.013084920247395833, "learning_rate": 0.0001, "loss": 4.9205, "loss/crossentropy": 2.230514347553253, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24377264082431793, "step": 5740 }, { "epoch": 0.11484, "grad_norm": 2.078125, "grad_norm_var": 0.013997395833333334, "learning_rate": 0.0001, "loss": 4.3831, "loss/crossentropy": 2.38068687915802, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24760407954454422, "step": 5742 }, { "epoch": 0.11488, "grad_norm": 2.3125, "grad_norm_var": 0.012483723958333333, "learning_rate": 0.0001, "loss": 4.5598, "loss/crossentropy": 2.238909125328064, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2503668889403343, "step": 5744 }, { "epoch": 0.11492, "grad_norm": 2.1875, "grad_norm_var": 0.015265909830729167, "learning_rate": 0.0001, "loss": 4.4848, "loss/crossentropy": 1.7423101663589478, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20424582809209824, "step": 5746 }, { "epoch": 0.11496, "grad_norm": 2.25, "grad_norm_var": 0.0181304931640625, "learning_rate": 0.0001, "loss": 4.6754, "loss/crossentropy": 2.5906827449798584, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24731651693582535, "step": 5748 }, { "epoch": 0.115, "grad_norm": 2.6875, "grad_norm_var": 0.025830078125, "learning_rate": 0.0001, "loss": 4.7427, "loss/crossentropy": 2.418861746788025, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2505089193582535, "step": 5750 }, { "epoch": 0.11504, "grad_norm": 2.765625, "grad_norm_var": 0.03658447265625, "learning_rate": 0.0001, "loss": 4.869, "loss/crossentropy": 2.158658504486084, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2381347045302391, "step": 5752 }, { "epoch": 0.11508, "grad_norm": 2.40625, "grad_norm_var": 0.03860270182291667, "learning_rate": 0.0001, "loss": 4.5073, "loss/crossentropy": 2.0938435196876526, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23639065027236938, "step": 5754 }, { "epoch": 0.11512, "grad_norm": 2.046875, "grad_norm_var": 0.0441314697265625, "learning_rate": 0.0001, "loss": 4.2227, "loss/crossentropy": 2.023799479007721, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24510329961776733, "step": 5756 }, { "epoch": 0.11516, "grad_norm": 2.265625, "grad_norm_var": 0.03854166666666667, "learning_rate": 0.0001, "loss": 4.5155, "loss/crossentropy": 2.3589184284210205, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2778936177492142, "step": 5758 }, { "epoch": 0.1152, "grad_norm": 2.1875, "grad_norm_var": 0.041731770833333334, "learning_rate": 0.0001, "loss": 4.4209, "loss/crossentropy": 2.4897998571395874, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23546544462442398, "step": 5760 }, { "epoch": 0.11524, "grad_norm": 2.1875, "grad_norm_var": 0.041747029622395834, "learning_rate": 0.0001, "loss": 4.4619, "loss/crossentropy": 2.0426196455955505, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22708147019147873, "step": 5762 }, { "epoch": 0.11528, "grad_norm": 2.28125, "grad_norm_var": 0.0397369384765625, "learning_rate": 0.0001, "loss": 4.6661, "loss/crossentropy": 2.2582051753997803, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25646351277828217, "step": 5764 }, { "epoch": 0.11532, "grad_norm": 2.21875, "grad_norm_var": 0.03430582682291667, "learning_rate": 0.0001, "loss": 4.5929, "loss/crossentropy": 2.181105613708496, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2380223199725151, "step": 5766 }, { "epoch": 0.11536, "grad_norm": 2.46875, "grad_norm_var": 0.026200358072916666, "learning_rate": 0.0001, "loss": 4.5078, "loss/crossentropy": 2.236580967903137, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26678355038166046, "step": 5768 }, { "epoch": 0.1154, "grad_norm": 2.203125, "grad_norm_var": 0.020361328125, "learning_rate": 0.0001, "loss": 4.6232, "loss/crossentropy": 1.9383749961853027, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24920085072517395, "step": 5770 }, { "epoch": 0.11544, "grad_norm": 2.15625, "grad_norm_var": 0.018529256184895832, "learning_rate": 0.0001, "loss": 4.6044, "loss/crossentropy": 2.2856688499450684, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.258881650865078, "step": 5772 }, { "epoch": 0.11548, "grad_norm": 2.28125, "grad_norm_var": 0.0239410400390625, "learning_rate": 0.0001, "loss": 4.8391, "loss/crossentropy": 2.2897390127182007, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24453644454479218, "step": 5774 }, { "epoch": 0.11552, "grad_norm": 2.25, "grad_norm_var": 0.014549763997395833, "learning_rate": 0.0001, "loss": 4.5698, "loss/crossentropy": 2.0502785444259644, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24106161296367645, "step": 5776 }, { "epoch": 0.11556, "grad_norm": 2.21875, "grad_norm_var": 0.01441650390625, "learning_rate": 0.0001, "loss": 4.5665, "loss/crossentropy": 2.344050645828247, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24690424650907516, "step": 5778 }, { "epoch": 0.1156, "grad_norm": 2.609375, "grad_norm_var": 0.021239217122395834, "learning_rate": 0.0001, "loss": 4.5273, "loss/crossentropy": 2.2274389266967773, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25618284940719604, "step": 5780 }, { "epoch": 0.11564, "grad_norm": 2.203125, "grad_norm_var": 0.022184244791666665, "learning_rate": 0.0001, "loss": 4.4888, "loss/crossentropy": 2.063184678554535, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22606099396944046, "step": 5782 }, { "epoch": 0.11568, "grad_norm": 2.203125, "grad_norm_var": 0.0206451416015625, "learning_rate": 0.0001, "loss": 4.2333, "loss/crossentropy": 2.093947410583496, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23588553071022034, "step": 5784 }, { "epoch": 0.11572, "grad_norm": 2.015625, "grad_norm_var": 0.0246490478515625, "learning_rate": 0.0001, "loss": 4.426, "loss/crossentropy": 2.1599318981170654, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20322780311107635, "step": 5786 }, { "epoch": 0.11576, "grad_norm": 2.40625, "grad_norm_var": 0.02506103515625, "learning_rate": 0.0001, "loss": 4.5398, "loss/crossentropy": 2.274693012237549, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2704206556081772, "step": 5788 }, { "epoch": 0.1158, "grad_norm": 2.421875, "grad_norm_var": 0.0222076416015625, "learning_rate": 0.0001, "loss": 4.6755, "loss/crossentropy": 1.9712103009223938, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.28300437331199646, "step": 5790 }, { "epoch": 0.11584, "grad_norm": 2.09375, "grad_norm_var": 0.024344889322916667, "learning_rate": 0.0001, "loss": 4.2035, "loss/crossentropy": 2.027747690677643, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24419110268354416, "step": 5792 }, { "epoch": 0.11588, "grad_norm": 2.34375, "grad_norm_var": 0.024251302083333332, "learning_rate": 0.0001, "loss": 4.5614, "loss/crossentropy": 2.162364959716797, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2522001415491104, "step": 5794 }, { "epoch": 0.11592, "grad_norm": 2.28125, "grad_norm_var": 0.0166656494140625, "learning_rate": 0.0001, "loss": 4.4711, "loss/crossentropy": 2.558881998062134, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26168718934059143, "step": 5796 }, { "epoch": 0.11596, "grad_norm": 2.125, "grad_norm_var": 0.0162994384765625, "learning_rate": 0.0001, "loss": 4.4108, "loss/crossentropy": 2.1027071475982666, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24049362540245056, "step": 5798 }, { "epoch": 0.116, "grad_norm": 2.234375, "grad_norm_var": 0.015034993489583334, "learning_rate": 0.0001, "loss": 4.6059, "loss/crossentropy": 2.488932490348816, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2697457820177078, "step": 5800 }, { "epoch": 0.11604, "grad_norm": 2.375, "grad_norm_var": 0.0121734619140625, "learning_rate": 0.0001, "loss": 4.6706, "loss/crossentropy": 2.0441418886184692, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23980429768562317, "step": 5802 }, { "epoch": 0.11608, "grad_norm": 2.375, "grad_norm_var": 0.0107421875, "learning_rate": 0.0001, "loss": 4.8471, "loss/crossentropy": 2.2873395681381226, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3105090409517288, "step": 5804 }, { "epoch": 0.11612, "grad_norm": 2.265625, "grad_norm_var": 0.00826416015625, "learning_rate": 0.0001, "loss": 4.5388, "loss/crossentropy": 1.8773444890975952, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20640414953231812, "step": 5806 }, { "epoch": 0.11616, "grad_norm": 2.375, "grad_norm_var": 0.007502237955729167, "learning_rate": 0.0001, "loss": 4.7338, "loss/crossentropy": 2.3736027479171753, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2679155319929123, "step": 5808 }, { "epoch": 0.1162, "grad_norm": 2.28125, "grad_norm_var": 0.010676066080729166, "learning_rate": 0.0001, "loss": 4.0496, "loss/crossentropy": 2.071690082550049, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21868111193180084, "step": 5810 }, { "epoch": 0.11624, "grad_norm": 2.390625, "grad_norm_var": 0.015425618489583333, "learning_rate": 0.0001, "loss": 4.4029, "loss/crossentropy": 2.0531184673309326, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24457989633083344, "step": 5812 }, { "epoch": 0.11628, "grad_norm": 2.3125, "grad_norm_var": 0.015718587239583335, "learning_rate": 0.0001, "loss": 4.6511, "loss/crossentropy": 2.1161271929740906, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22809523344039917, "step": 5814 }, { "epoch": 0.11632, "grad_norm": 2.046875, "grad_norm_var": 0.0196929931640625, "learning_rate": 0.0001, "loss": 4.2736, "loss/crossentropy": 1.6557151675224304, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20782940834760666, "step": 5816 }, { "epoch": 0.11636, "grad_norm": 2.3125, "grad_norm_var": 0.03704020182291667, "learning_rate": 0.0001, "loss": 4.5977, "loss/crossentropy": 2.2499040365219116, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2713513821363449, "step": 5818 }, { "epoch": 0.1164, "grad_norm": 2.390625, "grad_norm_var": 0.03723958333333333, "learning_rate": 0.0001, "loss": 4.6417, "loss/crossentropy": 2.3616446256637573, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2613145411014557, "step": 5820 }, { "epoch": 0.11644, "grad_norm": 2.265625, "grad_norm_var": 0.0353515625, "learning_rate": 0.0001, "loss": 4.3744, "loss/crossentropy": 2.0932790637016296, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23843754082918167, "step": 5822 }, { "epoch": 0.11648, "grad_norm": 2.25, "grad_norm_var": 0.03540751139322917, "learning_rate": 0.0001, "loss": 4.4089, "loss/crossentropy": 2.128177046775818, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26579540967941284, "step": 5824 }, { "epoch": 0.11652, "grad_norm": 2.296875, "grad_norm_var": 0.031281534830729166, "learning_rate": 0.0001, "loss": 4.6942, "loss/crossentropy": 2.332372784614563, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.27953268587589264, "step": 5826 }, { "epoch": 0.11656, "grad_norm": 2.1875, "grad_norm_var": 0.0263092041015625, "learning_rate": 0.0001, "loss": 4.4615, "loss/crossentropy": 2.360959053039551, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26249848306179047, "step": 5828 }, { "epoch": 0.1166, "grad_norm": 2.0, "grad_norm_var": 0.030745442708333334, "learning_rate": 0.0001, "loss": 3.9757, "loss/crossentropy": 1.9800339341163635, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22766301035881042, "step": 5830 }, { "epoch": 0.11664, "grad_norm": 2.390625, "grad_norm_var": 0.029442342122395833, "learning_rate": 0.0001, "loss": 4.3287, "loss/crossentropy": 2.082980155944824, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2368428260087967, "step": 5832 }, { "epoch": 0.11668, "grad_norm": 2.34375, "grad_norm_var": 0.011986287434895833, "learning_rate": 0.0001, "loss": 4.4441, "loss/crossentropy": 2.093027710914612, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21121951937675476, "step": 5834 }, { "epoch": 0.11672, "grad_norm": 2.296875, "grad_norm_var": 0.011031087239583333, "learning_rate": 0.0001, "loss": 4.5008, "loss/crossentropy": 2.1329175233840942, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25303877890110016, "step": 5836 }, { "epoch": 0.11676, "grad_norm": 2.21875, "grad_norm_var": 0.022932942708333334, "learning_rate": 0.0001, "loss": 4.5174, "loss/crossentropy": 1.79305762052536, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20448636263608932, "step": 5838 }, { "epoch": 0.1168, "grad_norm": 2.28125, "grad_norm_var": 0.024933878580729166, "learning_rate": 0.0001, "loss": 4.2722, "loss/crossentropy": 1.957836627960205, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22745755314826965, "step": 5840 }, { "epoch": 0.11684, "grad_norm": 2.25, "grad_norm_var": 0.025007120768229165, "learning_rate": 0.0001, "loss": 4.3773, "loss/crossentropy": 2.1167174577713013, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2437898963689804, "step": 5842 }, { "epoch": 0.11688, "grad_norm": 2.1875, "grad_norm_var": 0.024800618489583332, "learning_rate": 0.0001, "loss": 4.4856, "loss/crossentropy": 2.3288447856903076, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2355937883257866, "step": 5844 }, { "epoch": 0.11692, "grad_norm": 2.125, "grad_norm_var": 0.022337849934895834, "learning_rate": 0.0001, "loss": 4.4774, "loss/crossentropy": 2.2526416778564453, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24917340278625488, "step": 5846 }, { "epoch": 0.11696, "grad_norm": 2.25, "grad_norm_var": 0.0198883056640625, "learning_rate": 0.0001, "loss": 4.391, "loss/crossentropy": 2.194224774837494, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2625589966773987, "step": 5848 }, { "epoch": 0.117, "grad_norm": 2.34375, "grad_norm_var": 0.020116170247395832, "learning_rate": 0.0001, "loss": 4.4955, "loss/crossentropy": 2.0156877040863037, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22422078251838684, "step": 5850 }, { "epoch": 0.11704, "grad_norm": 2.21875, "grad_norm_var": 0.01962890625, "learning_rate": 0.0001, "loss": 4.6231, "loss/crossentropy": 2.2480785846710205, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24705884605646133, "step": 5852 }, { "epoch": 0.11708, "grad_norm": 2.25, "grad_norm_var": 0.0078521728515625, "learning_rate": 0.0001, "loss": 4.4817, "loss/crossentropy": 2.0915993452072144, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24669666588306427, "step": 5854 }, { "epoch": 0.11712, "grad_norm": 2.171875, "grad_norm_var": 0.0062652587890625, "learning_rate": 0.0001, "loss": 4.4444, "loss/crossentropy": 2.1283876299858093, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24228712916374207, "step": 5856 }, { "epoch": 0.11716, "grad_norm": 2.625, "grad_norm_var": 0.015523274739583334, "learning_rate": 0.0001, "loss": 4.6582, "loss/crossentropy": 2.028861939907074, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2281404584646225, "step": 5858 }, { "epoch": 0.1172, "grad_norm": 2.15625, "grad_norm_var": 0.015852864583333334, "learning_rate": 0.0001, "loss": 4.2293, "loss/crossentropy": 2.154610753059387, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24233703315258026, "step": 5860 }, { "epoch": 0.11724, "grad_norm": 2.421875, "grad_norm_var": 0.015787760416666668, "learning_rate": 0.0001, "loss": 4.5059, "loss/crossentropy": 1.9396602511405945, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22303076088428497, "step": 5862 }, { "epoch": 0.11728, "grad_norm": 2.578125, "grad_norm_var": 0.021061197916666666, "learning_rate": 0.0001, "loss": 4.7447, "loss/crossentropy": 2.053893029689789, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26897912472486496, "step": 5864 }, { "epoch": 0.11732, "grad_norm": 2.0625, "grad_norm_var": 0.0263336181640625, "learning_rate": 0.0001, "loss": 4.3529, "loss/crossentropy": 1.990949273109436, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22180908918380737, "step": 5866 }, { "epoch": 0.11736, "grad_norm": 2.453125, "grad_norm_var": 0.030085245768229168, "learning_rate": 0.0001, "loss": 4.6434, "loss/crossentropy": 2.0929455161094666, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24055248498916626, "step": 5868 }, { "epoch": 0.1174, "grad_norm": 2.46875, "grad_norm_var": 0.03168843587239583, "learning_rate": 0.0001, "loss": 4.5465, "loss/crossentropy": 2.1476733684539795, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2416895627975464, "step": 5870 }, { "epoch": 0.11744, "grad_norm": 2.21875, "grad_norm_var": 0.027962239583333333, "learning_rate": 0.0001, "loss": 4.5633, "loss/crossentropy": 1.74330335855484, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22001302987337112, "step": 5872 }, { "epoch": 0.11748, "grad_norm": 2.265625, "grad_norm_var": 0.021507771809895833, "learning_rate": 0.0001, "loss": 4.2725, "loss/crossentropy": 1.8903921246528625, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21950766444206238, "step": 5874 }, { "epoch": 0.11752, "grad_norm": 2.140625, "grad_norm_var": 0.022069295247395832, "learning_rate": 0.0001, "loss": 4.3673, "loss/crossentropy": 1.798406720161438, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20271167159080505, "step": 5876 }, { "epoch": 0.11756, "grad_norm": 2.25, "grad_norm_var": 0.021089680989583335, "learning_rate": 0.0001, "loss": 4.7128, "loss/crossentropy": 1.9651959538459778, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21256517618894577, "step": 5878 }, { "epoch": 0.1176, "grad_norm": 2.109375, "grad_norm_var": 0.016844685872395834, "learning_rate": 0.0001, "loss": 4.4158, "loss/crossentropy": 2.039245307445526, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23225219547748566, "step": 5880 }, { "epoch": 0.11764, "grad_norm": 2.40625, "grad_norm_var": 0.013329060872395833, "learning_rate": 0.0001, "loss": 4.8741, "loss/crossentropy": 2.500381350517273, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.31757834553718567, "step": 5882 }, { "epoch": 0.11768, "grad_norm": 2.59375, "grad_norm_var": 0.018195597330729167, "learning_rate": 0.0001, "loss": 4.6463, "loss/crossentropy": 2.0540305972099304, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2474198415875435, "step": 5884 }, { "epoch": 0.11772, "grad_norm": 2.359375, "grad_norm_var": 0.016462198893229165, "learning_rate": 0.0001, "loss": 4.5047, "loss/crossentropy": 2.072624385356903, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2482014298439026, "step": 5886 }, { "epoch": 0.11776, "grad_norm": 2.328125, "grad_norm_var": 0.017853800455729166, "learning_rate": 0.0001, "loss": 4.4025, "loss/crossentropy": 2.153423309326172, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.234448604285717, "step": 5888 }, { "epoch": 0.1178, "grad_norm": 2.5625, "grad_norm_var": 0.025325520833333334, "learning_rate": 0.0001, "loss": 4.2698, "loss/crossentropy": 1.5941627621650696, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20982372760772705, "step": 5890 }, { "epoch": 0.11784, "grad_norm": 2.390625, "grad_norm_var": 0.025886027018229167, "learning_rate": 0.0001, "loss": 4.5168, "loss/crossentropy": 2.2858930826187134, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24936140328645706, "step": 5892 }, { "epoch": 0.11788, "grad_norm": 2.296875, "grad_norm_var": 0.028076171875, "learning_rate": 0.0001, "loss": 4.536, "loss/crossentropy": 2.333559274673462, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26152122020721436, "step": 5894 }, { "epoch": 0.11792, "grad_norm": 2.296875, "grad_norm_var": 0.027106730143229167, "learning_rate": 0.0001, "loss": 4.412, "loss/crossentropy": 2.134613037109375, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24929091334342957, "step": 5896 }, { "epoch": 0.11796, "grad_norm": 2.296875, "grad_norm_var": 0.026423136393229168, "learning_rate": 0.0001, "loss": 4.4052, "loss/crossentropy": 2.179764688014984, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24523546546697617, "step": 5898 }, { "epoch": 0.118, "grad_norm": 2.109375, "grad_norm_var": 0.0204742431640625, "learning_rate": 0.0001, "loss": 4.3431, "loss/crossentropy": 2.1184223294258118, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21715932339429855, "step": 5900 }, { "epoch": 0.11804, "grad_norm": 2.078125, "grad_norm_var": 0.021882120768229166, "learning_rate": 0.0001, "loss": 4.5606, "loss/crossentropy": 2.024593770503998, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23016374558210373, "step": 5902 }, { "epoch": 0.11808, "grad_norm": 2.265625, "grad_norm_var": 0.0204986572265625, "learning_rate": 0.0001, "loss": 4.3646, "loss/crossentropy": 2.077186107635498, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2538782134652138, "step": 5904 }, { "epoch": 0.11812, "grad_norm": 2.359375, "grad_norm_var": 0.013190714518229167, "learning_rate": 0.0001, "loss": 4.9135, "loss/crossentropy": 2.2535945177078247, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25025132298469543, "step": 5906 }, { "epoch": 0.11816, "grad_norm": 2.21875, "grad_norm_var": 0.01064453125, "learning_rate": 0.0001, "loss": 4.5512, "loss/crossentropy": 2.5321284532546997, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2729681059718132, "step": 5908 }, { "epoch": 0.1182, "grad_norm": 2.234375, "grad_norm_var": 0.006403605143229167, "learning_rate": 0.0001, "loss": 4.332, "loss/crossentropy": 2.043885111808777, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23721691966056824, "step": 5910 }, { "epoch": 0.11824, "grad_norm": 2.296875, "grad_norm_var": 0.005204264322916667, "learning_rate": 0.0001, "loss": 4.3901, "loss/crossentropy": 2.1343676447868347, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23325734585523605, "step": 5912 }, { "epoch": 0.11828, "grad_norm": 2.03125, "grad_norm_var": 0.0074045817057291664, "learning_rate": 0.0001, "loss": 4.3551, "loss/crossentropy": 2.1164477467536926, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24526448547840118, "step": 5914 }, { "epoch": 0.11832, "grad_norm": 2.1875, "grad_norm_var": 0.0064737955729166664, "learning_rate": 0.0001, "loss": 4.5162, "loss/crossentropy": 2.268216848373413, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24837128818035126, "step": 5916 }, { "epoch": 0.11836, "grad_norm": 2.25, "grad_norm_var": 0.0057037353515625, "learning_rate": 0.0001, "loss": 4.4636, "loss/crossentropy": 2.074695885181427, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24956009536981583, "step": 5918 }, { "epoch": 0.1184, "grad_norm": 2.203125, "grad_norm_var": 0.007372029622395833, "learning_rate": 0.0001, "loss": 4.3165, "loss/crossentropy": 1.9369722604751587, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2316487580537796, "step": 5920 }, { "epoch": 0.11844, "grad_norm": 2.15625, "grad_norm_var": 0.00611572265625, "learning_rate": 0.0001, "loss": 4.2789, "loss/crossentropy": 2.2189531326293945, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23884039372205734, "step": 5922 }, { "epoch": 0.11848, "grad_norm": 2.34375, "grad_norm_var": 0.007323201497395833, "learning_rate": 0.0001, "loss": 4.5965, "loss/crossentropy": 2.3833028078079224, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2504591718316078, "step": 5924 }, { "epoch": 0.11852, "grad_norm": 2.234375, "grad_norm_var": 0.007005818684895833, "learning_rate": 0.0001, "loss": 4.6346, "loss/crossentropy": 2.0443845987319946, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23422807455062866, "step": 5926 }, { "epoch": 0.11856, "grad_norm": 2.203125, "grad_norm_var": 0.0073964436848958336, "learning_rate": 0.0001, "loss": 4.9061, "loss/crossentropy": 2.223625063896179, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24985776841640472, "step": 5928 }, { "epoch": 0.1186, "grad_norm": 2.171875, "grad_norm_var": 0.005126953125, "learning_rate": 0.0001, "loss": 4.493, "loss/crossentropy": 2.1353545784950256, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22846727818250656, "step": 5930 }, { "epoch": 0.11864, "grad_norm": 2.203125, "grad_norm_var": 0.005159505208333333, "learning_rate": 0.0001, "loss": 4.5786, "loss/crossentropy": 2.0497827529907227, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21461189538240433, "step": 5932 }, { "epoch": 0.11868, "grad_norm": 2.28125, "grad_norm_var": 0.004715983072916667, "learning_rate": 0.0001, "loss": 4.5071, "loss/crossentropy": 1.9257362484931946, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2216043472290039, "step": 5934 }, { "epoch": 0.11872, "grad_norm": 2.171875, "grad_norm_var": 0.0034006754557291668, "learning_rate": 0.0001, "loss": 4.4212, "loss/crossentropy": 2.0458216071128845, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23162957280874252, "step": 5936 }, { "epoch": 0.11876, "grad_norm": 2.1875, "grad_norm_var": 0.0033274332682291666, "learning_rate": 0.0001, "loss": 4.6109, "loss/crossentropy": 2.0786396861076355, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22303655743598938, "step": 5938 }, { "epoch": 0.1188, "grad_norm": 2.546875, "grad_norm_var": 0.04327799479166667, "learning_rate": 0.0001, "loss": 4.4861, "loss/crossentropy": 1.9800568222999573, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22236331552267075, "step": 5940 }, { "epoch": 0.11884, "grad_norm": 2.0625, "grad_norm_var": 0.0502349853515625, "learning_rate": 0.0001, "loss": 4.056, "loss/crossentropy": 1.882250189781189, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21664132922887802, "step": 5942 }, { "epoch": 0.11888, "grad_norm": 2.203125, "grad_norm_var": 0.05025634765625, "learning_rate": 0.0001, "loss": 4.5965, "loss/crossentropy": 2.3682695627212524, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2698971778154373, "step": 5944 }, { "epoch": 0.11892, "grad_norm": 2.265625, "grad_norm_var": 0.0505523681640625, "learning_rate": 0.0001, "loss": 4.6364, "loss/crossentropy": 2.225574493408203, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2642917186021805, "step": 5946 }, { "epoch": 0.11896, "grad_norm": 2.1875, "grad_norm_var": 0.05032145182291667, "learning_rate": 0.0001, "loss": 4.3157, "loss/crossentropy": 1.8634169697761536, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2170402556657791, "step": 5948 }, { "epoch": 0.119, "grad_norm": 2.3125, "grad_norm_var": 0.05123291015625, "learning_rate": 0.0001, "loss": 5.0392, "loss/crossentropy": 2.4500025510787964, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.27845144271850586, "step": 5950 }, { "epoch": 0.11904, "grad_norm": 2.046875, "grad_norm_var": 0.054108683268229166, "learning_rate": 0.0001, "loss": 4.1179, "loss/crossentropy": 2.1532052755355835, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24952176213264465, "step": 5952 }, { "epoch": 0.11908, "grad_norm": 2.140625, "grad_norm_var": 0.0561676025390625, "learning_rate": 0.0001, "loss": 4.2592, "loss/crossentropy": 2.066560387611389, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2356283888220787, "step": 5954 }, { "epoch": 0.11912, "grad_norm": 2.265625, "grad_norm_var": 0.01021728515625, "learning_rate": 0.0001, "loss": 4.6187, "loss/crossentropy": 1.9679089784622192, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23729706555604935, "step": 5956 }, { "epoch": 0.11916, "grad_norm": 2.234375, "grad_norm_var": 0.0071441650390625, "learning_rate": 0.0001, "loss": 4.3463, "loss/crossentropy": 2.3490394353866577, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26400282233953476, "step": 5958 }, { "epoch": 0.1192, "grad_norm": 2.296875, "grad_norm_var": 0.008576456705729167, "learning_rate": 0.0001, "loss": 4.3629, "loss/crossentropy": 2.145757555961609, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23329483717679977, "step": 5960 }, { "epoch": 0.11924, "grad_norm": 2.1875, "grad_norm_var": 0.005399576822916667, "learning_rate": 0.0001, "loss": 4.2376, "loss/crossentropy": 2.0764617919921875, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22371648252010345, "step": 5962 }, { "epoch": 0.11928, "grad_norm": 2.46875, "grad_norm_var": 0.010380045572916666, "learning_rate": 0.0001, "loss": 4.8106, "loss/crossentropy": 2.199389696121216, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23210449516773224, "step": 5964 }, { "epoch": 0.11932, "grad_norm": 2.09375, "grad_norm_var": 0.01064453125, "learning_rate": 0.0001, "loss": 4.3515, "loss/crossentropy": 1.9008439183235168, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21474837511777878, "step": 5966 }, { "epoch": 0.11936, "grad_norm": 2.0, "grad_norm_var": 0.0148590087890625, "learning_rate": 0.0001, "loss": 4.4603, "loss/crossentropy": 2.1779539585113525, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23485051095485687, "step": 5968 }, { "epoch": 0.1194, "grad_norm": 2.15625, "grad_norm_var": 0.014176432291666667, "learning_rate": 0.0001, "loss": 4.428, "loss/crossentropy": 2.267147421836853, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24924689531326294, "step": 5970 }, { "epoch": 0.11944, "grad_norm": 2.1875, "grad_norm_var": 0.016292317708333334, "learning_rate": 0.0001, "loss": 4.0856, "loss/crossentropy": 2.0618110299110413, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24352984875440598, "step": 5972 }, { "epoch": 0.11948, "grad_norm": 2.34375, "grad_norm_var": 0.023758951822916666, "learning_rate": 0.0001, "loss": 4.5802, "loss/crossentropy": 2.1419676542282104, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23680058121681213, "step": 5974 }, { "epoch": 0.11952, "grad_norm": 2.265625, "grad_norm_var": 0.023368326822916667, "learning_rate": 0.0001, "loss": 4.6554, "loss/crossentropy": 2.0376622080802917, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22515031695365906, "step": 5976 }, { "epoch": 0.11956, "grad_norm": 2.46875, "grad_norm_var": 0.025739542643229165, "learning_rate": 0.0001, "loss": 4.4213, "loss/crossentropy": 1.7675965428352356, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.219834603369236, "step": 5978 }, { "epoch": 0.1196, "grad_norm": 2.1875, "grad_norm_var": 0.024030558268229165, "learning_rate": 0.0001, "loss": 4.2886, "loss/crossentropy": 1.8919905424118042, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2256327122449875, "step": 5980 }, { "epoch": 0.11964, "grad_norm": 2.15625, "grad_norm_var": 0.0228424072265625, "learning_rate": 0.0001, "loss": 4.6148, "loss/crossentropy": 2.287980794906616, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25571687519550323, "step": 5982 }, { "epoch": 0.11968, "grad_norm": 2.15625, "grad_norm_var": 0.0169830322265625, "learning_rate": 0.0001, "loss": 4.3527, "loss/crossentropy": 2.1030094027519226, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23401429504156113, "step": 5984 }, { "epoch": 0.11972, "grad_norm": 2.421875, "grad_norm_var": 0.0166412353515625, "learning_rate": 0.0001, "loss": 4.8857, "loss/crossentropy": 2.469533920288086, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2548582851886749, "step": 5986 }, { "epoch": 0.11976, "grad_norm": 2.328125, "grad_norm_var": 0.018047841389973958, "learning_rate": 0.0001, "loss": 4.167, "loss/crossentropy": 2.140324354171753, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22603372484445572, "step": 5988 }, { "epoch": 0.1198, "grad_norm": 2.140625, "grad_norm_var": 0.015773264567057292, "learning_rate": 0.0001, "loss": 4.3848, "loss/crossentropy": 2.1848061084747314, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22712922096252441, "step": 5990 }, { "epoch": 0.11984, "grad_norm": 2.34375, "grad_norm_var": 0.015380605061848959, "learning_rate": 0.0001, "loss": 4.6457, "loss/crossentropy": 2.2872358560562134, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2352529615163803, "step": 5992 }, { "epoch": 0.11988, "grad_norm": 2.25, "grad_norm_var": 0.012318674723307292, "learning_rate": 0.0001, "loss": 4.6136, "loss/crossentropy": 2.082811713218689, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24107889831066132, "step": 5994 }, { "epoch": 0.11992, "grad_norm": 2.109375, "grad_norm_var": 0.013602447509765626, "learning_rate": 0.0001, "loss": 4.2089, "loss/crossentropy": 2.23664391040802, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23429522663354874, "step": 5996 }, { "epoch": 0.11996, "grad_norm": 2.265625, "grad_norm_var": 0.013242340087890625, "learning_rate": 0.0001, "loss": 4.5237, "loss/crossentropy": 2.451270341873169, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2541816979646683, "step": 5998 }, { "epoch": 0.12, "grad_norm": 2.109375, "grad_norm_var": 0.013561757405598958, "learning_rate": 0.0001, "loss": 4.5559, "loss/crossentropy": 2.1744157671928406, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22349119931459427, "step": 6000 }, { "epoch": 0.12004, "grad_norm": 2.09375, "grad_norm_var": 0.013171132405598958, "learning_rate": 0.0001, "loss": 4.5532, "loss/crossentropy": 2.0316836833953857, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.259635865688324, "step": 6002 }, { "epoch": 0.12008, "grad_norm": 2.609375, "grad_norm_var": 0.022337849934895834, "learning_rate": 0.0001, "loss": 4.3674, "loss/crossentropy": 2.0989437103271484, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25056491047143936, "step": 6004 }, { "epoch": 0.12012, "grad_norm": 2.5625, "grad_norm_var": 0.02633056640625, "learning_rate": 0.0001, "loss": 4.0883, "loss/crossentropy": 1.9609100818634033, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22969383746385574, "step": 6006 }, { "epoch": 0.12016, "grad_norm": 2.15625, "grad_norm_var": 0.027665201822916666, "learning_rate": 0.0001, "loss": 4.2094, "loss/crossentropy": 2.077883243560791, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22850078344345093, "step": 6008 }, { "epoch": 0.1202, "grad_norm": 2.34375, "grad_norm_var": 1.6465779622395833, "learning_rate": 0.0001, "loss": 4.5659, "loss/crossentropy": 1.7967591285705566, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2126556932926178, "step": 6010 }, { "epoch": 0.12024, "grad_norm": 2.234375, "grad_norm_var": 1.62222900390625, "learning_rate": 0.0001, "loss": 4.4018, "loss/crossentropy": 1.6516226530075073, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2188895046710968, "step": 6012 }, { "epoch": 0.12028, "grad_norm": 2.21875, "grad_norm_var": 1.62984619140625, "learning_rate": 0.0001, "loss": 4.3303, "loss/crossentropy": 2.161388635635376, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23447516560554504, "step": 6014 }, { "epoch": 0.12032, "grad_norm": 2.328125, "grad_norm_var": 1.6235636393229167, "learning_rate": 0.0001, "loss": 4.5112, "loss/crossentropy": 1.9205461740493774, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23797179013490677, "step": 6016 }, { "epoch": 0.12036, "grad_norm": 2.3125, "grad_norm_var": 1.616844685872396, "learning_rate": 0.0001, "loss": 4.8019, "loss/crossentropy": 2.025223135948181, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25120896100997925, "step": 6018 }, { "epoch": 0.1204, "grad_norm": 2.25, "grad_norm_var": 1.6347005208333334, "learning_rate": 0.0001, "loss": 4.4819, "loss/crossentropy": 1.957942008972168, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22228525578975677, "step": 6020 }, { "epoch": 0.12044, "grad_norm": 2.171875, "grad_norm_var": 1.674201456705729, "learning_rate": 0.0001, "loss": 4.6193, "loss/crossentropy": 2.3325445652008057, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26250994950532913, "step": 6022 }, { "epoch": 0.12048, "grad_norm": 2.03125, "grad_norm_var": 1.6957997639973958, "learning_rate": 0.0001, "loss": 4.264, "loss/crossentropy": 2.0049667954444885, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22990728169679642, "step": 6024 }, { "epoch": 0.12052, "grad_norm": 2.265625, "grad_norm_var": 0.08680013020833334, "learning_rate": 0.0001, "loss": 4.5996, "loss/crossentropy": 2.0473387241363525, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23628919571638107, "step": 6026 }, { "epoch": 0.12056, "grad_norm": 1.9765625, "grad_norm_var": 0.09107640584309896, "learning_rate": 0.0001, "loss": 4.109, "loss/crossentropy": 2.013141930103302, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21728236973285675, "step": 6028 }, { "epoch": 0.1206, "grad_norm": 2.109375, "grad_norm_var": 0.09145278930664062, "learning_rate": 0.0001, "loss": 4.3869, "loss/crossentropy": 2.1269132494926453, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25139085948467255, "step": 6030 }, { "epoch": 0.12064, "grad_norm": 2.234375, "grad_norm_var": 0.09058405558268229, "learning_rate": 0.0001, "loss": 4.5568, "loss/crossentropy": 2.5267512798309326, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.27998340129852295, "step": 6032 }, { "epoch": 0.12068, "grad_norm": 2.78125, "grad_norm_var": 0.10746027628580729, "learning_rate": 0.0001, "loss": 4.2615, "loss/crossentropy": 1.8502249717712402, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22863731533288956, "step": 6034 }, { "epoch": 0.12072, "grad_norm": 2.34375, "grad_norm_var": 0.10850601196289063, "learning_rate": 0.0001, "loss": 4.2197, "loss/crossentropy": 1.7754456400871277, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23809141665697098, "step": 6036 }, { "epoch": 0.12076, "grad_norm": 2.203125, "grad_norm_var": 0.033760325113932295, "learning_rate": 0.0001, "loss": 4.4714, "loss/crossentropy": 1.9596920609474182, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2111937776207924, "step": 6038 }, { "epoch": 0.1208, "grad_norm": 2.203125, "grad_norm_var": 0.02995580037434896, "learning_rate": 0.0001, "loss": 4.3079, "loss/crossentropy": 1.888563334941864, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21698038280010223, "step": 6040 }, { "epoch": 0.12084, "grad_norm": 2.296875, "grad_norm_var": 0.031404368082682294, "learning_rate": 0.0001, "loss": 4.2412, "loss/crossentropy": 2.1646993160247803, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22735021263360977, "step": 6042 }, { "epoch": 0.12088, "grad_norm": 2.1875, "grad_norm_var": 0.0258941650390625, "learning_rate": 0.0001, "loss": 4.525, "loss/crossentropy": 2.0790343284606934, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2193024456501007, "step": 6044 }, { "epoch": 0.12092, "grad_norm": 2.109375, "grad_norm_var": 0.025211588541666666, "learning_rate": 0.0001, "loss": 4.3538, "loss/crossentropy": 2.2733768224716187, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.249516561627388, "step": 6046 }, { "epoch": 0.12096, "grad_norm": 2.21875, "grad_norm_var": 0.02431640625, "learning_rate": 0.0001, "loss": 4.7286, "loss/crossentropy": 2.5003533363342285, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22928690910339355, "step": 6048 }, { "epoch": 0.121, "grad_norm": 2.453125, "grad_norm_var": 0.03629150390625, "learning_rate": 0.0001, "loss": 4.6269, "loss/crossentropy": 1.9606900215148926, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2152085080742836, "step": 6050 }, { "epoch": 0.12104, "grad_norm": 2.390625, "grad_norm_var": 0.0384765625, "learning_rate": 0.0001, "loss": 4.5724, "loss/crossentropy": 2.266395926475525, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26055608689785004, "step": 6052 }, { "epoch": 0.12108, "grad_norm": 2.296875, "grad_norm_var": 0.037821451822916664, "learning_rate": 0.0001, "loss": 4.5841, "loss/crossentropy": 2.1753041744232178, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2327822595834732, "step": 6054 }, { "epoch": 0.12112, "grad_norm": 2.15625, "grad_norm_var": 0.041304524739583334, "learning_rate": 0.0001, "loss": 4.2198, "loss/crossentropy": 1.775630235671997, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2073052078485489, "step": 6056 }, { "epoch": 0.12116, "grad_norm": 2.21875, "grad_norm_var": 0.03975321451822917, "learning_rate": 0.0001, "loss": 4.486, "loss/crossentropy": 2.415855050086975, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.270721398293972, "step": 6058 }, { "epoch": 0.1212, "grad_norm": 2.34375, "grad_norm_var": 0.039453125, "learning_rate": 0.0001, "loss": 4.6647, "loss/crossentropy": 2.2122162580490112, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24723708629608154, "step": 6060 }, { "epoch": 0.12124, "grad_norm": 2.375, "grad_norm_var": 0.041792805989583334, "learning_rate": 0.0001, "loss": 4.3803, "loss/crossentropy": 1.9540830850601196, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21385761350393295, "step": 6062 }, { "epoch": 0.12128, "grad_norm": 2.34375, "grad_norm_var": 0.0413726806640625, "learning_rate": 0.0001, "loss": 4.979, "loss/crossentropy": 1.923313319683075, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22986505925655365, "step": 6064 }, { "epoch": 0.12132, "grad_norm": 3.03125, "grad_norm_var": 0.05182291666666667, "learning_rate": 0.0001, "loss": 4.6895, "loss/crossentropy": 2.4042444229125977, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.30044034123420715, "step": 6066 }, { "epoch": 0.12136, "grad_norm": 2.25, "grad_norm_var": 0.049225870768229166, "learning_rate": 0.0001, "loss": 4.5857, "loss/crossentropy": 2.2610549926757812, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2628016769886017, "step": 6068 }, { "epoch": 0.1214, "grad_norm": 2.140625, "grad_norm_var": 0.0513671875, "learning_rate": 0.0001, "loss": 4.5024, "loss/crossentropy": 1.9100797176361084, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22663169354200363, "step": 6070 }, { "epoch": 0.12144, "grad_norm": 2.34375, "grad_norm_var": 0.04830729166666667, "learning_rate": 0.0001, "loss": 4.5916, "loss/crossentropy": 2.3971651792526245, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26831263303756714, "step": 6072 }, { "epoch": 0.12148, "grad_norm": 2.453125, "grad_norm_var": 0.0492095947265625, "learning_rate": 0.0001, "loss": 4.4553, "loss/crossentropy": 2.106821596622467, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23786011338233948, "step": 6074 }, { "epoch": 0.12152, "grad_norm": 2.125, "grad_norm_var": 0.052469889322916664, "learning_rate": 0.0001, "loss": 4.4225, "loss/crossentropy": 2.1920535564422607, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23812127113342285, "step": 6076 }, { "epoch": 0.12156, "grad_norm": 2.203125, "grad_norm_var": 0.049088541666666666, "learning_rate": 0.0001, "loss": 4.2314, "loss/crossentropy": 2.3014419078826904, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2380141019821167, "step": 6078 }, { "epoch": 0.1216, "grad_norm": 2.265625, "grad_norm_var": 0.04903055826822917, "learning_rate": 0.0001, "loss": 4.4558, "loss/crossentropy": 2.1721781492233276, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24158813059329987, "step": 6080 }, { "epoch": 0.12164, "grad_norm": 2.21875, "grad_norm_var": 0.013232421875, "learning_rate": 0.0001, "loss": 4.5192, "loss/crossentropy": 2.1230934858322144, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23764102160930634, "step": 6082 }, { "epoch": 0.12168, "grad_norm": 2.296875, "grad_norm_var": 0.0166412353515625, "learning_rate": 0.0001, "loss": 4.2147, "loss/crossentropy": 1.9570311307907104, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24687693268060684, "step": 6084 }, { "epoch": 0.12172, "grad_norm": 2.15625, "grad_norm_var": 0.014997355143229167, "learning_rate": 0.0001, "loss": 4.2063, "loss/crossentropy": 2.272383213043213, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2567671462893486, "step": 6086 }, { "epoch": 0.12176, "grad_norm": 2.1875, "grad_norm_var": 0.009598795572916667, "learning_rate": 0.0001, "loss": 4.645, "loss/crossentropy": 2.216492176055908, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24377377331256866, "step": 6088 }, { "epoch": 0.1218, "grad_norm": 2.140625, "grad_norm_var": 0.007201131184895833, "learning_rate": 0.0001, "loss": 4.5956, "loss/crossentropy": 2.25177001953125, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23176074773073196, "step": 6090 }, { "epoch": 0.12184, "grad_norm": 2.21875, "grad_norm_var": 0.009098307291666666, "learning_rate": 0.0001, "loss": 4.1966, "loss/crossentropy": 2.2452452182769775, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24528006464242935, "step": 6092 }, { "epoch": 0.12188, "grad_norm": 2.25, "grad_norm_var": 0.0091949462890625, "learning_rate": 0.0001, "loss": 4.3548, "loss/crossentropy": 2.078445553779602, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2276337668299675, "step": 6094 }, { "epoch": 0.12192, "grad_norm": 2.125, "grad_norm_var": 0.00982666015625, "learning_rate": 0.0001, "loss": 4.4435, "loss/crossentropy": 2.2938032150268555, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2784377336502075, "step": 6096 }, { "epoch": 0.12196, "grad_norm": 2.015625, "grad_norm_var": 0.0136627197265625, "learning_rate": 0.0001, "loss": 4.1588, "loss/crossentropy": 2.028180480003357, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23716723918914795, "step": 6098 }, { "epoch": 0.122, "grad_norm": 2.609375, "grad_norm_var": 0.020580037434895834, "learning_rate": 0.0001, "loss": 4.6345, "loss/crossentropy": 2.4959323406219482, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2510451450943947, "step": 6100 }, { "epoch": 0.12204, "grad_norm": 2.21875, "grad_norm_var": 0.023856608072916667, "learning_rate": 0.0001, "loss": 4.6639, "loss/crossentropy": 2.186043620109558, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23511512577533722, "step": 6102 }, { "epoch": 0.12208, "grad_norm": 2.359375, "grad_norm_var": 0.024442545572916665, "learning_rate": 0.0001, "loss": 4.6727, "loss/crossentropy": 2.6631078720092773, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25689610838890076, "step": 6104 }, { "epoch": 0.12212, "grad_norm": 2.453125, "grad_norm_var": 0.025609334309895832, "learning_rate": 0.0001, "loss": 4.7943, "loss/crossentropy": 2.310486674308777, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26937828958034515, "step": 6106 }, { "epoch": 0.12216, "grad_norm": 2.359375, "grad_norm_var": 0.0247711181640625, "learning_rate": 0.0001, "loss": 4.9892, "loss/crossentropy": 2.2036240100860596, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24968907237052917, "step": 6108 }, { "epoch": 0.1222, "grad_norm": 2.984375, "grad_norm_var": 0.058251953125, "learning_rate": 0.0001, "loss": 4.3527, "loss/crossentropy": 1.9434874057769775, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23247701674699783, "step": 6110 }, { "epoch": 0.12224, "grad_norm": 2.296875, "grad_norm_var": 0.05579020182291667, "learning_rate": 0.0001, "loss": 4.4149, "loss/crossentropy": 2.0525330305099487, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24810528755187988, "step": 6112 }, { "epoch": 0.12228, "grad_norm": 2.328125, "grad_norm_var": 0.04482320149739583, "learning_rate": 0.0001, "loss": 4.5205, "loss/crossentropy": 2.0085532665252686, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2514277398586273, "step": 6114 }, { "epoch": 0.12232, "grad_norm": 2.328125, "grad_norm_var": 0.04120686848958333, "learning_rate": 0.0001, "loss": 4.5745, "loss/crossentropy": 1.7393967509269714, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2239851951599121, "step": 6116 }, { "epoch": 0.12236, "grad_norm": 2.546875, "grad_norm_var": 0.04327799479166667, "learning_rate": 0.0001, "loss": 4.7332, "loss/crossentropy": 1.8714343905448914, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22823140025138855, "step": 6118 }, { "epoch": 0.1224, "grad_norm": 2.3125, "grad_norm_var": 0.04192708333333333, "learning_rate": 0.0001, "loss": 4.6177, "loss/crossentropy": 1.9353562593460083, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22261886298656464, "step": 6120 }, { "epoch": 0.12244, "grad_norm": 2.171875, "grad_norm_var": 0.0469635009765625, "learning_rate": 0.0001, "loss": 4.3154, "loss/crossentropy": 2.0409420132637024, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2319282591342926, "step": 6122 }, { "epoch": 0.12248, "grad_norm": 2.125, "grad_norm_var": 0.04882405598958333, "learning_rate": 0.0001, "loss": 4.4764, "loss/crossentropy": 2.2461307048797607, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25047267973423004, "step": 6124 }, { "epoch": 0.12252, "grad_norm": 2.203125, "grad_norm_var": 0.014012654622395834, "learning_rate": 0.0001, "loss": 4.4863, "loss/crossentropy": 2.1060246229171753, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24702349305152893, "step": 6126 }, { "epoch": 0.12256, "grad_norm": 2.21875, "grad_norm_var": 0.01718724568684896, "learning_rate": 0.0001, "loss": 3.9093, "loss/crossentropy": 1.7481068968772888, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19611438363790512, "step": 6128 }, { "epoch": 0.1226, "grad_norm": 2.21875, "grad_norm_var": 0.017329661051432292, "learning_rate": 0.0001, "loss": 4.5506, "loss/crossentropy": 1.8341861963272095, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20660528540611267, "step": 6130 }, { "epoch": 0.12264, "grad_norm": 2.421875, "grad_norm_var": 0.018507639567057293, "learning_rate": 0.0001, "loss": 4.3388, "loss/crossentropy": 1.7332024574279785, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21615543216466904, "step": 6132 }, { "epoch": 0.12268, "grad_norm": 2.078125, "grad_norm_var": 0.013203684488932292, "learning_rate": 0.0001, "loss": 4.3027, "loss/crossentropy": 1.8959643244743347, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.219951793551445, "step": 6134 }, { "epoch": 0.12272, "grad_norm": 2.265625, "grad_norm_var": 0.012601470947265625, "learning_rate": 0.0001, "loss": 4.6945, "loss/crossentropy": 2.234723210334778, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2342153787612915, "step": 6136 }, { "epoch": 0.12276, "grad_norm": 2.3125, "grad_norm_var": 0.011637115478515625, "learning_rate": 0.0001, "loss": 4.7394, "loss/crossentropy": 2.2297592759132385, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24097590148448944, "step": 6138 }, { "epoch": 0.1228, "grad_norm": 2.40625, "grad_norm_var": 0.013586171468098958, "learning_rate": 0.0001, "loss": 4.592, "loss/crossentropy": 2.013442814350128, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2532622739672661, "step": 6140 }, { "epoch": 0.12284, "grad_norm": 2.15625, "grad_norm_var": 0.014833323160807292, "learning_rate": 0.0001, "loss": 4.417, "loss/crossentropy": 2.3066656589508057, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24480555206537247, "step": 6142 }, { "epoch": 0.12288, "grad_norm": 2.4375, "grad_norm_var": 0.014241536458333334, "learning_rate": 0.0001, "loss": 4.4156, "loss/crossentropy": 1.9281310439109802, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22363708168268204, "step": 6144 }, { "epoch": 0.12292, "grad_norm": 2.078125, "grad_norm_var": 0.016792805989583333, "learning_rate": 0.0001, "loss": 4.4314, "loss/crossentropy": 1.965875267982483, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22153983265161514, "step": 6146 }, { "epoch": 0.12296, "grad_norm": 2.8125, "grad_norm_var": 0.03536783854166667, "learning_rate": 0.0001, "loss": 4.6139, "loss/crossentropy": 1.811126947402954, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21779045462608337, "step": 6148 }, { "epoch": 0.123, "grad_norm": 2.34375, "grad_norm_var": 0.0311676025390625, "learning_rate": 0.0001, "loss": 4.4855, "loss/crossentropy": 2.5114762783050537, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2542252689599991, "step": 6150 }, { "epoch": 0.12304, "grad_norm": 2.296875, "grad_norm_var": 0.028271484375, "learning_rate": 0.0001, "loss": 4.1269, "loss/crossentropy": 1.8784565925598145, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24637068808078766, "step": 6152 }, { "epoch": 0.12308, "grad_norm": 2.234375, "grad_norm_var": 0.028685506184895834, "learning_rate": 0.0001, "loss": 4.3377, "loss/crossentropy": 1.6900760531425476, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2170872688293457, "step": 6154 }, { "epoch": 0.12312, "grad_norm": 2.21875, "grad_norm_var": 0.02867431640625, "learning_rate": 0.0001, "loss": 4.5705, "loss/crossentropy": 2.413783550262451, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2624947279691696, "step": 6156 }, { "epoch": 0.12316, "grad_norm": 2.03125, "grad_norm_var": 0.030126953125, "learning_rate": 0.0001, "loss": 4.3463, "loss/crossentropy": 2.0598058104515076, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2071695551276207, "step": 6158 }, { "epoch": 0.1232, "grad_norm": 2.375, "grad_norm_var": 0.029781087239583334, "learning_rate": 0.0001, "loss": 4.6702, "loss/crossentropy": 2.0407246947288513, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24353720247745514, "step": 6160 }, { "epoch": 0.12324, "grad_norm": 2.171875, "grad_norm_var": 0.028450520833333333, "learning_rate": 0.0001, "loss": 4.3116, "loss/crossentropy": 1.9608840346336365, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20965111255645752, "step": 6162 }, { "epoch": 0.12328, "grad_norm": 2.046875, "grad_norm_var": 0.01640625, "learning_rate": 0.0001, "loss": 4.2146, "loss/crossentropy": 2.0231454372406006, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.219897098839283, "step": 6164 }, { "epoch": 0.12332, "grad_norm": 2.25, "grad_norm_var": 0.015999348958333333, "learning_rate": 0.0001, "loss": 4.614, "loss/crossentropy": 2.217663288116455, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2466331645846367, "step": 6166 }, { "epoch": 0.12336, "grad_norm": 2.234375, "grad_norm_var": 0.01597900390625, "learning_rate": 0.0001, "loss": 4.4135, "loss/crossentropy": 1.6825732588768005, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1999632865190506, "step": 6168 }, { "epoch": 0.1234, "grad_norm": 2.5625, "grad_norm_var": 0.020897420247395833, "learning_rate": 0.0001, "loss": 4.4552, "loss/crossentropy": 2.288950800895691, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26286834478378296, "step": 6170 }, { "epoch": 0.12344, "grad_norm": 2.453125, "grad_norm_var": 0.023225911458333335, "learning_rate": 0.0001, "loss": 4.3376, "loss/crossentropy": 1.8202561140060425, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22252517193555832, "step": 6172 }, { "epoch": 0.12348, "grad_norm": 2.3125, "grad_norm_var": 0.019710286458333334, "learning_rate": 0.0001, "loss": 4.5023, "loss/crossentropy": 2.1007159948349, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24983422458171844, "step": 6174 }, { "epoch": 0.12352, "grad_norm": 2.171875, "grad_norm_var": 0.0199371337890625, "learning_rate": 0.0001, "loss": 4.4513, "loss/crossentropy": 2.0953084230422974, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25545743107795715, "step": 6176 }, { "epoch": 0.12356, "grad_norm": 2.171875, "grad_norm_var": 0.019001261393229166, "learning_rate": 0.0001, "loss": 4.4315, "loss/crossentropy": 2.107246518135071, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22741339355707169, "step": 6178 }, { "epoch": 0.1236, "grad_norm": 2.265625, "grad_norm_var": 0.013118489583333334, "learning_rate": 0.0001, "loss": 4.4982, "loss/crossentropy": 2.1219626665115356, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22499094158411026, "step": 6180 }, { "epoch": 0.12364, "grad_norm": 2.25, "grad_norm_var": 0.013093058268229167, "learning_rate": 0.0001, "loss": 4.4263, "loss/crossentropy": 1.8892266154289246, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2255062311887741, "step": 6182 }, { "epoch": 0.12368, "grad_norm": 2.21875, "grad_norm_var": 0.015315755208333334, "learning_rate": 0.0001, "loss": 4.5884, "loss/crossentropy": 2.0765860080718994, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24712087213993073, "step": 6184 }, { "epoch": 0.12372, "grad_norm": 2.296875, "grad_norm_var": 0.010074869791666666, "learning_rate": 0.0001, "loss": 4.5431, "loss/crossentropy": 2.301337718963623, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24419523775577545, "step": 6186 }, { "epoch": 0.12376, "grad_norm": 2.359375, "grad_norm_var": 0.008373006184895834, "learning_rate": 0.0001, "loss": 4.5043, "loss/crossentropy": 2.0489712953567505, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25768278539180756, "step": 6188 }, { "epoch": 0.1238, "grad_norm": 2.140625, "grad_norm_var": 0.0128082275390625, "learning_rate": 0.0001, "loss": 4.2654, "loss/crossentropy": 1.9328826069831848, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22916094958782196, "step": 6190 }, { "epoch": 0.12384, "grad_norm": 2.25, "grad_norm_var": 0.01171875, "learning_rate": 0.0001, "loss": 4.1574, "loss/crossentropy": 1.6330446004867554, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20600398629903793, "step": 6192 }, { "epoch": 0.12388, "grad_norm": 2.234375, "grad_norm_var": 0.0111724853515625, "learning_rate": 0.0001, "loss": 4.4769, "loss/crossentropy": 1.988203227519989, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22711507230997086, "step": 6194 }, { "epoch": 0.12392, "grad_norm": 1.9609375, "grad_norm_var": 0.01590143839518229, "learning_rate": 0.0001, "loss": 4.125, "loss/crossentropy": 2.036049246788025, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2110549360513687, "step": 6196 }, { "epoch": 0.12396, "grad_norm": 2.265625, "grad_norm_var": 0.01624120076497396, "learning_rate": 0.0001, "loss": 4.5546, "loss/crossentropy": 2.217681884765625, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23705793917179108, "step": 6198 }, { "epoch": 0.124, "grad_norm": 2.234375, "grad_norm_var": 0.015276845296223958, "learning_rate": 0.0001, "loss": 4.4803, "loss/crossentropy": 2.3878824710845947, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24000737071037292, "step": 6200 }, { "epoch": 0.12404, "grad_norm": 2.265625, "grad_norm_var": 0.015852610270182293, "learning_rate": 0.0001, "loss": 4.4754, "loss/crossentropy": 2.2790249586105347, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26150786131620407, "step": 6202 }, { "epoch": 0.12408, "grad_norm": 2.1875, "grad_norm_var": 0.015036773681640626, "learning_rate": 0.0001, "loss": 4.4703, "loss/crossentropy": 2.251123785972595, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25231797993183136, "step": 6204 }, { "epoch": 0.12412, "grad_norm": 2.171875, "grad_norm_var": 0.010802968343098959, "learning_rate": 0.0001, "loss": 4.5294, "loss/crossentropy": 1.8977670073509216, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21783004701137543, "step": 6206 }, { "epoch": 0.12416, "grad_norm": 2.265625, "grad_norm_var": 0.012894439697265624, "learning_rate": 0.0001, "loss": 4.6458, "loss/crossentropy": 2.385319232940674, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.29223111271858215, "step": 6208 }, { "epoch": 0.1242, "grad_norm": 2.40625, "grad_norm_var": 0.015964508056640625, "learning_rate": 0.0001, "loss": 4.6555, "loss/crossentropy": 1.9274529218673706, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22827968001365662, "step": 6210 }, { "epoch": 0.12424, "grad_norm": 2.40625, "grad_norm_var": 0.01226806640625, "learning_rate": 0.0001, "loss": 4.8232, "loss/crossentropy": 2.1861478090286255, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2424854040145874, "step": 6212 }, { "epoch": 0.12428, "grad_norm": 2.171875, "grad_norm_var": 0.01207275390625, "learning_rate": 0.0001, "loss": 4.3002, "loss/crossentropy": 2.2234357595443726, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2471691370010376, "step": 6214 }, { "epoch": 0.12432, "grad_norm": 2.625, "grad_norm_var": 0.022541300455729166, "learning_rate": 0.0001, "loss": 4.5216, "loss/crossentropy": 2.365513563156128, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2640291824936867, "step": 6216 }, { "epoch": 0.12436, "grad_norm": 2.21875, "grad_norm_var": 0.021955362955729165, "learning_rate": 0.0001, "loss": 4.282, "loss/crossentropy": 1.964316964149475, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22341662645339966, "step": 6218 }, { "epoch": 0.1244, "grad_norm": 2.3125, "grad_norm_var": 0.021805826822916666, "learning_rate": 0.0001, "loss": 4.7078, "loss/crossentropy": 2.3704408407211304, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24599966406822205, "step": 6220 }, { "epoch": 0.12444, "grad_norm": 2.1875, "grad_norm_var": 0.0193756103515625, "learning_rate": 0.0001, "loss": 4.7558, "loss/crossentropy": 2.1461408138275146, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25205330550670624, "step": 6222 }, { "epoch": 0.12448, "grad_norm": 2.21875, "grad_norm_var": 0.018745930989583333, "learning_rate": 0.0001, "loss": 4.7032, "loss/crossentropy": 2.3997104167938232, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23371660709381104, "step": 6224 }, { "epoch": 0.12452, "grad_norm": 2.09375, "grad_norm_var": 0.0212890625, "learning_rate": 0.0001, "loss": 4.1842, "loss/crossentropy": 1.7976875305175781, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21662414073944092, "step": 6226 }, { "epoch": 0.12456, "grad_norm": 2.140625, "grad_norm_var": 0.02008056640625, "learning_rate": 0.0001, "loss": 4.2214, "loss/crossentropy": 1.9929583668708801, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21378497034311295, "step": 6228 }, { "epoch": 0.1246, "grad_norm": 2.21875, "grad_norm_var": 0.02017822265625, "learning_rate": 0.0001, "loss": 4.2982, "loss/crossentropy": 1.8853623867034912, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21267645806074142, "step": 6230 }, { "epoch": 0.12464, "grad_norm": 2.234375, "grad_norm_var": 0.008454386393229167, "learning_rate": 0.0001, "loss": 4.2638, "loss/crossentropy": 2.2534812688827515, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23003943264484406, "step": 6232 }, { "epoch": 0.12468, "grad_norm": 2.0, "grad_norm_var": 0.010863240559895833, "learning_rate": 0.0001, "loss": 4.2504, "loss/crossentropy": 2.026564121246338, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2172931283712387, "step": 6234 }, { "epoch": 0.12472, "grad_norm": 2.25, "grad_norm_var": 0.01021728515625, "learning_rate": 0.0001, "loss": 4.3555, "loss/crossentropy": 1.9323118925094604, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22170638293027878, "step": 6236 }, { "epoch": 0.12476, "grad_norm": 2.296875, "grad_norm_var": 0.007938639322916666, "learning_rate": 0.0001, "loss": 4.479, "loss/crossentropy": 2.056011915206909, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2656880244612694, "step": 6238 }, { "epoch": 0.1248, "grad_norm": 2.296875, "grad_norm_var": 0.00830078125, "learning_rate": 0.0001, "loss": 4.6288, "loss/crossentropy": 2.095108926296234, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23341026157140732, "step": 6240 }, { "epoch": 0.12484, "grad_norm": 2.4375, "grad_norm_var": 0.011555989583333334, "learning_rate": 0.0001, "loss": 4.737, "loss/crossentropy": 2.0198334455490112, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24095112830400467, "step": 6242 }, { "epoch": 0.12488, "grad_norm": 2.09375, "grad_norm_var": 0.012105305989583334, "learning_rate": 0.0001, "loss": 4.3745, "loss/crossentropy": 2.0397544503211975, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22567399591207504, "step": 6244 }, { "epoch": 0.12492, "grad_norm": 2.1875, "grad_norm_var": 0.0120513916015625, "learning_rate": 0.0001, "loss": 4.3633, "loss/crossentropy": 1.9094319343566895, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22690637409687042, "step": 6246 }, { "epoch": 0.12496, "grad_norm": 2.265625, "grad_norm_var": 0.011295572916666666, "learning_rate": 0.0001, "loss": 4.6474, "loss/crossentropy": 2.27765429019928, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25075703859329224, "step": 6248 }, { "epoch": 0.125, "grad_norm": 2.609375, "grad_norm_var": 0.017023722330729168, "learning_rate": 0.0001, "loss": 4.5769, "loss/crossentropy": 2.0027456283569336, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24267998337745667, "step": 6250 }, { "epoch": 0.12504, "grad_norm": 2.359375, "grad_norm_var": 0.017723592122395833, "learning_rate": 0.0001, "loss": 4.5398, "loss/crossentropy": 2.276857614517212, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24048637598752975, "step": 6252 }, { "epoch": 0.12508, "grad_norm": 2.390625, "grad_norm_var": 0.018065388997395834, "learning_rate": 0.0001, "loss": 4.7603, "loss/crossentropy": 2.1234214305877686, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23481453210115433, "step": 6254 }, { "epoch": 0.12512, "grad_norm": 2.28125, "grad_norm_var": 0.019624837239583335, "learning_rate": 0.0001, "loss": 4.7537, "loss/crossentropy": 2.201158881187439, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23473594337701797, "step": 6256 }, { "epoch": 0.12516, "grad_norm": 2.296875, "grad_norm_var": 0.017220052083333333, "learning_rate": 0.0001, "loss": 4.7202, "loss/crossentropy": 2.3437804579734802, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24881581962108612, "step": 6258 }, { "epoch": 0.1252, "grad_norm": 2.265625, "grad_norm_var": 0.015461222330729166, "learning_rate": 0.0001, "loss": 4.5938, "loss/crossentropy": 2.0984586477279663, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24332843720912933, "step": 6260 }, { "epoch": 0.12524, "grad_norm": 2.390625, "grad_norm_var": 0.016942342122395832, "learning_rate": 0.0001, "loss": 4.2317, "loss/crossentropy": 1.7946080565452576, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21510492265224457, "step": 6262 }, { "epoch": 0.12528, "grad_norm": 2.171875, "grad_norm_var": 0.016161092122395835, "learning_rate": 0.0001, "loss": 4.3889, "loss/crossentropy": 2.022711932659149, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22890903800725937, "step": 6264 }, { "epoch": 0.12532, "grad_norm": 2.046875, "grad_norm_var": 0.009577433268229166, "learning_rate": 0.0001, "loss": 4.288, "loss/crossentropy": 1.9752087593078613, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21972095221281052, "step": 6266 }, { "epoch": 0.12536, "grad_norm": 2.234375, "grad_norm_var": 0.008625284830729166, "learning_rate": 0.0001, "loss": 4.4952, "loss/crossentropy": 1.7267251014709473, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21305006742477417, "step": 6268 }, { "epoch": 0.1254, "grad_norm": 2.109375, "grad_norm_var": 0.008869425455729166, "learning_rate": 0.0001, "loss": 4.2164, "loss/crossentropy": 2.249786615371704, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24875187873840332, "step": 6270 }, { "epoch": 0.12544, "grad_norm": 2.46875, "grad_norm_var": 0.012190755208333333, "learning_rate": 0.0001, "loss": 4.693, "loss/crossentropy": 2.402994990348816, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25536587834358215, "step": 6272 }, { "epoch": 0.12548, "grad_norm": 2.46875, "grad_norm_var": 0.016243489583333333, "learning_rate": 0.0001, "loss": 4.606, "loss/crossentropy": 2.240913987159729, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3013365715742111, "step": 6274 }, { "epoch": 0.12552, "grad_norm": 2.390625, "grad_norm_var": 0.019071451822916665, "learning_rate": 0.0001, "loss": 4.4691, "loss/crossentropy": 2.0767332911491394, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22113215178251266, "step": 6276 }, { "epoch": 0.12556, "grad_norm": 2.140625, "grad_norm_var": 0.02041015625, "learning_rate": 0.0001, "loss": 4.608, "loss/crossentropy": 1.8625048995018005, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20321352779865265, "step": 6278 }, { "epoch": 0.1256, "grad_norm": 2.3125, "grad_norm_var": 0.024072265625, "learning_rate": 0.0001, "loss": 4.0386, "loss/crossentropy": 1.9143638610839844, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22085773944854736, "step": 6280 }, { "epoch": 0.12564, "grad_norm": 2.34375, "grad_norm_var": 0.023192342122395834, "learning_rate": 0.0001, "loss": 4.372, "loss/crossentropy": 2.3756210803985596, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26027245819568634, "step": 6282 }, { "epoch": 0.12568, "grad_norm": 2.21875, "grad_norm_var": 0.02427978515625, "learning_rate": 0.0001, "loss": 4.5526, "loss/crossentropy": 1.9310896396636963, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2086925357580185, "step": 6284 }, { "epoch": 0.12572, "grad_norm": 2.34375, "grad_norm_var": 0.021675618489583333, "learning_rate": 0.0001, "loss": 4.4751, "loss/crossentropy": 2.1757423877716064, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.241075336933136, "step": 6286 }, { "epoch": 0.12576, "grad_norm": 2.265625, "grad_norm_var": 0.021284993489583334, "learning_rate": 0.0001, "loss": 4.0946, "loss/crossentropy": 2.3057546615600586, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24882248044013977, "step": 6288 }, { "epoch": 0.1258, "grad_norm": 3.140625, "grad_norm_var": 0.06883036295572917, "learning_rate": 0.0001, "loss": 4.5211, "loss/crossentropy": 2.1551633477211, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2796258181333542, "step": 6290 }, { "epoch": 0.12584, "grad_norm": 2.296875, "grad_norm_var": 0.06608072916666667, "learning_rate": 0.0001, "loss": 4.4915, "loss/crossentropy": 2.0627459287643433, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24781616777181625, "step": 6292 }, { "epoch": 0.12588, "grad_norm": 2.453125, "grad_norm_var": 0.06463216145833334, "learning_rate": 0.0001, "loss": 4.5896, "loss/crossentropy": 1.795321524143219, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2113223671913147, "step": 6294 }, { "epoch": 0.12592, "grad_norm": 2.171875, "grad_norm_var": 0.06004130045572917, "learning_rate": 0.0001, "loss": 4.4988, "loss/crossentropy": 2.323319673538208, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24748852849006653, "step": 6296 }, { "epoch": 0.12596, "grad_norm": 2.296875, "grad_norm_var": 0.05998942057291667, "learning_rate": 0.0001, "loss": 4.4801, "loss/crossentropy": 2.053200662136078, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2278411090373993, "step": 6298 }, { "epoch": 0.126, "grad_norm": 2.4375, "grad_norm_var": 0.060155232747395836, "learning_rate": 0.0001, "loss": 4.4605, "loss/crossentropy": 2.102539896965027, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26071713864803314, "step": 6300 }, { "epoch": 0.12604, "grad_norm": 2.265625, "grad_norm_var": 0.059554036458333334, "learning_rate": 0.0001, "loss": 4.4613, "loss/crossentropy": 1.9316620826721191, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23007915914058685, "step": 6302 }, { "epoch": 0.12608, "grad_norm": 2.6875, "grad_norm_var": 0.06288655598958333, "learning_rate": 0.0001, "loss": 4.4194, "loss/crossentropy": 1.9966526627540588, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2424854189157486, "step": 6304 }, { "epoch": 0.12612, "grad_norm": 2.046875, "grad_norm_var": 0.022541300455729166, "learning_rate": 0.0001, "loss": 4.2382, "loss/crossentropy": 1.895868957042694, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2255951091647148, "step": 6306 }, { "epoch": 0.12616, "grad_norm": 2.171875, "grad_norm_var": 0.023688761393229167, "learning_rate": 0.0001, "loss": 4.7758, "loss/crossentropy": 2.3897345066070557, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24461720883846283, "step": 6308 }, { "epoch": 0.1262, "grad_norm": 2.40625, "grad_norm_var": 0.022858683268229166, "learning_rate": 0.0001, "loss": 4.8275, "loss/crossentropy": 2.059292197227478, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.29264035820961, "step": 6310 }, { "epoch": 0.12624, "grad_norm": 2.390625, "grad_norm_var": 0.0221343994140625, "learning_rate": 0.0001, "loss": 4.5098, "loss/crossentropy": 1.8142406344413757, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22127485275268555, "step": 6312 }, { "epoch": 0.12628, "grad_norm": 2.109375, "grad_norm_var": 0.0245025634765625, "learning_rate": 0.0001, "loss": 4.2349, "loss/crossentropy": 2.0186127424240112, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.217330664396286, "step": 6314 }, { "epoch": 0.12632, "grad_norm": 2.265625, "grad_norm_var": 0.024820963541666668, "learning_rate": 0.0001, "loss": 4.0102, "loss/crossentropy": 1.9439889192581177, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2108919695019722, "step": 6316 }, { "epoch": 0.12636, "grad_norm": 2.28125, "grad_norm_var": 0.024690755208333335, "learning_rate": 0.0001, "loss": 4.8672, "loss/crossentropy": 2.617791175842285, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2756097912788391, "step": 6318 }, { "epoch": 0.1264, "grad_norm": 2.515625, "grad_norm_var": 0.017609659830729166, "learning_rate": 0.0001, "loss": 4.3918, "loss/crossentropy": 1.9074286818504333, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22491587698459625, "step": 6320 }, { "epoch": 0.12644, "grad_norm": 2.09375, "grad_norm_var": 0.016331990559895832, "learning_rate": 0.0001, "loss": 4.4304, "loss/crossentropy": 2.1414809226989746, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22703612595796585, "step": 6322 }, { "epoch": 0.12648, "grad_norm": 2.28125, "grad_norm_var": 0.0154205322265625, "learning_rate": 0.0001, "loss": 4.4982, "loss/crossentropy": 2.291784942150116, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24490328133106232, "step": 6324 }, { "epoch": 0.12652, "grad_norm": 2.1875, "grad_norm_var": 0.0143218994140625, "learning_rate": 0.0001, "loss": 4.228, "loss/crossentropy": 1.938249409198761, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22611317038536072, "step": 6326 }, { "epoch": 0.12656, "grad_norm": 2.03125, "grad_norm_var": 0.0187408447265625, "learning_rate": 0.0001, "loss": 4.4932, "loss/crossentropy": 1.7510024905204773, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.30439992994070053, "step": 6328 }, { "epoch": 0.1266, "grad_norm": 2.46875, "grad_norm_var": 0.022907511393229166, "learning_rate": 0.0001, "loss": 4.4351, "loss/crossentropy": 2.3168221712112427, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25660980492830276, "step": 6330 }, { "epoch": 0.12664, "grad_norm": 2.046875, "grad_norm_var": 0.024409993489583334, "learning_rate": 0.0001, "loss": 4.5002, "loss/crossentropy": 1.9957427978515625, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21420849114656448, "step": 6332 }, { "epoch": 0.12668, "grad_norm": 2.0625, "grad_norm_var": 0.025104777018229166, "learning_rate": 0.0001, "loss": 4.3583, "loss/crossentropy": 2.4371464252471924, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2576342821121216, "step": 6334 }, { "epoch": 0.12672, "grad_norm": 2.171875, "grad_norm_var": 0.021458943684895832, "learning_rate": 0.0001, "loss": 4.2497, "loss/crossentropy": 1.910677433013916, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21360614150762558, "step": 6336 }, { "epoch": 0.12676, "grad_norm": 2.203125, "grad_norm_var": 0.020750935872395834, "learning_rate": 0.0001, "loss": 4.2679, "loss/crossentropy": 2.057362914085388, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23079442232847214, "step": 6338 }, { "epoch": 0.1268, "grad_norm": 2.21875, "grad_norm_var": 0.0191070556640625, "learning_rate": 0.0001, "loss": 4.1818, "loss/crossentropy": 2.0249438881874084, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24038050323724747, "step": 6340 }, { "epoch": 0.12684, "grad_norm": 2.25, "grad_norm_var": 0.019074503580729166, "learning_rate": 0.0001, "loss": 4.5374, "loss/crossentropy": 2.0371533632278442, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2355760782957077, "step": 6342 }, { "epoch": 0.12688, "grad_norm": 2.203125, "grad_norm_var": 0.0130523681640625, "learning_rate": 0.0001, "loss": 4.4849, "loss/crossentropy": 2.3101218938827515, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24161705374717712, "step": 6344 }, { "epoch": 0.12692, "grad_norm": 2.25, "grad_norm_var": 0.010261027018229167, "learning_rate": 0.0001, "loss": 4.6399, "loss/crossentropy": 2.1990097761154175, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2537754699587822, "step": 6346 }, { "epoch": 0.12696, "grad_norm": 2.125, "grad_norm_var": 0.0136383056640625, "learning_rate": 0.0001, "loss": 4.2697, "loss/crossentropy": 2.1783597469329834, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2106732353568077, "step": 6348 }, { "epoch": 0.127, "grad_norm": 2.359375, "grad_norm_var": 0.012987263997395833, "learning_rate": 0.0001, "loss": 4.5127, "loss/crossentropy": 2.2316598892211914, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23841773718595505, "step": 6350 }, { "epoch": 0.12704, "grad_norm": 2.265625, "grad_norm_var": 0.00943603515625, "learning_rate": 0.0001, "loss": 4.5651, "loss/crossentropy": 2.0329924821853638, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22755083441734314, "step": 6352 }, { "epoch": 0.12708, "grad_norm": 2.078125, "grad_norm_var": 0.014525349934895833, "learning_rate": 0.0001, "loss": 4.1888, "loss/crossentropy": 1.9174052476882935, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20000722259283066, "step": 6354 }, { "epoch": 0.12712, "grad_norm": 2.15625, "grad_norm_var": 0.0154693603515625, "learning_rate": 0.0001, "loss": 4.3321, "loss/crossentropy": 1.968774676322937, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2194407731294632, "step": 6356 }, { "epoch": 0.12716, "grad_norm": 2.296875, "grad_norm_var": 0.0183013916015625, "learning_rate": 0.0001, "loss": 4.5768, "loss/crossentropy": 2.1767213344573975, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24483858048915863, "step": 6358 }, { "epoch": 0.1272, "grad_norm": 2.171875, "grad_norm_var": 0.018391927083333332, "learning_rate": 0.0001, "loss": 4.5063, "loss/crossentropy": 1.8390987515449524, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21677500754594803, "step": 6360 }, { "epoch": 0.12724, "grad_norm": 2.203125, "grad_norm_var": 0.016039021809895835, "learning_rate": 0.0001, "loss": 4.2635, "loss/crossentropy": 2.1923086643218994, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21362057328224182, "step": 6362 }, { "epoch": 0.12728, "grad_norm": 2.25, "grad_norm_var": 0.010347493489583333, "learning_rate": 0.0001, "loss": 4.3104, "loss/crossentropy": 2.030683994293213, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22884567826986313, "step": 6364 }, { "epoch": 0.12732, "grad_norm": 2.203125, "grad_norm_var": 0.00816650390625, "learning_rate": 0.0001, "loss": 4.452, "loss/crossentropy": 2.330837845802307, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24233026057481766, "step": 6366 }, { "epoch": 0.12736, "grad_norm": 2.453125, "grad_norm_var": 0.012040201822916667, "learning_rate": 0.0001, "loss": 4.6404, "loss/crossentropy": 2.001612663269043, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2332654967904091, "step": 6368 }, { "epoch": 0.1274, "grad_norm": 2.25, "grad_norm_var": 0.011750284830729167, "learning_rate": 0.0001, "loss": 4.7101, "loss/crossentropy": 1.9234941601753235, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23741928488016129, "step": 6370 }, { "epoch": 0.12744, "grad_norm": 2.078125, "grad_norm_var": 0.012262980143229166, "learning_rate": 0.0001, "loss": 4.2164, "loss/crossentropy": 2.059934139251709, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24736596643924713, "step": 6372 }, { "epoch": 0.12748, "grad_norm": 2.125, "grad_norm_var": 0.010798136393229166, "learning_rate": 0.0001, "loss": 4.3916, "loss/crossentropy": 2.25182843208313, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24349220097064972, "step": 6374 }, { "epoch": 0.12752, "grad_norm": 2.15625, "grad_norm_var": 0.010888671875, "learning_rate": 0.0001, "loss": 4.3742, "loss/crossentropy": 1.845405638217926, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2138313353061676, "step": 6376 }, { "epoch": 0.12756, "grad_norm": 2.546875, "grad_norm_var": 0.0164215087890625, "learning_rate": 0.0001, "loss": 4.568, "loss/crossentropy": 1.8833998441696167, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25304871797561646, "step": 6378 }, { "epoch": 0.1276, "grad_norm": 2.28125, "grad_norm_var": 0.017072550455729165, "learning_rate": 0.0001, "loss": 4.2282, "loss/crossentropy": 2.1066314578056335, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2231372445821762, "step": 6380 }, { "epoch": 0.12764, "grad_norm": 2.328125, "grad_norm_var": 0.018561808268229167, "learning_rate": 0.0001, "loss": 4.3381, "loss/crossentropy": 2.024670898914337, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22260528802871704, "step": 6382 }, { "epoch": 0.12768, "grad_norm": 2.109375, "grad_norm_var": 0.01763916015625, "learning_rate": 0.0001, "loss": 4.8132, "loss/crossentropy": 2.425115466117859, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2429298236966133, "step": 6384 }, { "epoch": 0.12772, "grad_norm": 2.109375, "grad_norm_var": 0.016597493489583334, "learning_rate": 0.0001, "loss": 4.2387, "loss/crossentropy": 2.1847925186157227, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2495381161570549, "step": 6386 }, { "epoch": 0.12776, "grad_norm": 2.265625, "grad_norm_var": 0.015746053059895834, "learning_rate": 0.0001, "loss": 4.6919, "loss/crossentropy": 2.1463611125946045, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2303348332643509, "step": 6388 }, { "epoch": 0.1278, "grad_norm": 2.171875, "grad_norm_var": 0.014839680989583333, "learning_rate": 0.0001, "loss": 4.2714, "loss/crossentropy": 1.9755331873893738, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2232731431722641, "step": 6390 }, { "epoch": 0.12784, "grad_norm": 2.078125, "grad_norm_var": 0.015973917643229165, "learning_rate": 0.0001, "loss": 4.5255, "loss/crossentropy": 2.0741465091705322, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22912710905075073, "step": 6392 }, { "epoch": 0.12788, "grad_norm": 2.203125, "grad_norm_var": 0.009471638997395834, "learning_rate": 0.0001, "loss": 4.3834, "loss/crossentropy": 2.1314439177513123, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22910036891698837, "step": 6394 }, { "epoch": 0.12792, "grad_norm": 2.109375, "grad_norm_var": 0.0107086181640625, "learning_rate": 0.0001, "loss": 4.1615, "loss/crossentropy": 1.976862907409668, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.233707495033741, "step": 6396 }, { "epoch": 0.12796, "grad_norm": 2.171875, "grad_norm_var": 0.010807291666666666, "learning_rate": 0.0001, "loss": 4.3939, "loss/crossentropy": 2.0534666180610657, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21424879133701324, "step": 6398 }, { "epoch": 0.128, "grad_norm": 2.109375, "grad_norm_var": 0.008088175455729167, "learning_rate": 0.0001, "loss": 4.6647, "loss/crossentropy": 2.2200992107391357, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2836414724588394, "step": 6400 }, { "epoch": 0.12804, "grad_norm": 2.234375, "grad_norm_var": 0.007542928059895833, "learning_rate": 0.0001, "loss": 4.3802, "loss/crossentropy": 1.8642286658287048, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21438255906105042, "step": 6402 }, { "epoch": 0.12808, "grad_norm": 2.15625, "grad_norm_var": 0.005464680989583333, "learning_rate": 0.0001, "loss": 4.3868, "loss/crossentropy": 1.9571613073349, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22038634121418, "step": 6404 }, { "epoch": 0.12812, "grad_norm": 2.125, "grad_norm_var": 0.008980305989583333, "learning_rate": 0.0001, "loss": 4.6888, "loss/crossentropy": 2.0420787930488586, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2532814294099808, "step": 6406 }, { "epoch": 0.12816, "grad_norm": 2.109375, "grad_norm_var": 0.008349609375, "learning_rate": 0.0001, "loss": 4.3357, "loss/crossentropy": 1.7511736750602722, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2067284658551216, "step": 6408 }, { "epoch": 0.1282, "grad_norm": 2.21875, "grad_norm_var": 0.0095123291015625, "learning_rate": 0.0001, "loss": 4.5568, "loss/crossentropy": 2.0310762524604797, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2763645648956299, "step": 6410 }, { "epoch": 0.12824, "grad_norm": 2.28125, "grad_norm_var": 0.008617146809895834, "learning_rate": 0.0001, "loss": 4.575, "loss/crossentropy": 2.112699866294861, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2409205138683319, "step": 6412 }, { "epoch": 0.12828, "grad_norm": 2.296875, "grad_norm_var": 0.14166259765625, "learning_rate": 0.0001, "loss": 4.3023, "loss/crossentropy": 1.8243904113769531, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2079915553331375, "step": 6414 }, { "epoch": 0.12832, "grad_norm": 2.125, "grad_norm_var": 0.14143473307291668, "learning_rate": 0.0001, "loss": 4.5395, "loss/crossentropy": 1.8310211896896362, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21304991096258163, "step": 6416 }, { "epoch": 0.12836, "grad_norm": 2.140625, "grad_norm_var": 0.14011128743489584, "learning_rate": 0.0001, "loss": 4.3806, "loss/crossentropy": 1.9653990268707275, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22207710891962051, "step": 6418 }, { "epoch": 0.1284, "grad_norm": 2.15625, "grad_norm_var": 0.13982645670572916, "learning_rate": 0.0001, "loss": 4.5869, "loss/crossentropy": 2.0583431124687195, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23768695443868637, "step": 6420 }, { "epoch": 0.12844, "grad_norm": 2.296875, "grad_norm_var": 0.1391998291015625, "learning_rate": 0.0001, "loss": 4.6512, "loss/crossentropy": 2.1162279844284058, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23963302373886108, "step": 6422 }, { "epoch": 0.12848, "grad_norm": 2.28125, "grad_norm_var": 0.13347066243489583, "learning_rate": 0.0001, "loss": 4.6268, "loss/crossentropy": 2.068794012069702, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21967273205518723, "step": 6424 }, { "epoch": 0.12852, "grad_norm": 2.109375, "grad_norm_var": 0.13585611979166667, "learning_rate": 0.0001, "loss": 4.4004, "loss/crossentropy": 2.165997266769409, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22991405427455902, "step": 6426 }, { "epoch": 0.12856, "grad_norm": 2.25, "grad_norm_var": 0.14031473795572916, "learning_rate": 0.0001, "loss": 4.1816, "loss/crossentropy": 1.9116491675376892, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22784583270549774, "step": 6428 }, { "epoch": 0.1286, "grad_norm": 2.296875, "grad_norm_var": 0.0087890625, "learning_rate": 0.0001, "loss": 4.5268, "loss/crossentropy": 2.296347498893738, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2618508040904999, "step": 6430 }, { "epoch": 0.12864, "grad_norm": 2.125, "grad_norm_var": 0.0100250244140625, "learning_rate": 0.0001, "loss": 4.0363, "loss/crossentropy": 2.3961373567581177, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.27667778730392456, "step": 6432 }, { "epoch": 0.12868, "grad_norm": 2.171875, "grad_norm_var": 0.009598795572916667, "learning_rate": 0.0001, "loss": 4.3993, "loss/crossentropy": 1.9141033291816711, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22818633913993835, "step": 6434 }, { "epoch": 0.12872, "grad_norm": 2.15625, "grad_norm_var": 0.0070149739583333336, "learning_rate": 0.0001, "loss": 4.481, "loss/crossentropy": 2.4431718587875366, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24852856248617172, "step": 6436 }, { "epoch": 0.12876, "grad_norm": 2.171875, "grad_norm_var": 0.006494140625, "learning_rate": 0.0001, "loss": 4.5153, "loss/crossentropy": 1.9343949556350708, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22181283682584763, "step": 6438 }, { "epoch": 0.1288, "grad_norm": 2.171875, "grad_norm_var": 0.005549112955729167, "learning_rate": 0.0001, "loss": 4.4547, "loss/crossentropy": 2.203734040260315, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23536919057369232, "step": 6440 }, { "epoch": 0.12884, "grad_norm": 2.046875, "grad_norm_var": 0.0055572509765625, "learning_rate": 0.0001, "loss": 4.354, "loss/crossentropy": 2.0419046878814697, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2121758982539177, "step": 6442 }, { "epoch": 0.12888, "grad_norm": 2.328125, "grad_norm_var": 0.0054595947265625, "learning_rate": 0.0001, "loss": 4.5738, "loss/crossentropy": 2.3551554679870605, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23853591084480286, "step": 6444 }, { "epoch": 0.12892, "grad_norm": 2.21875, "grad_norm_var": 0.004813639322916666, "learning_rate": 0.0001, "loss": 4.4334, "loss/crossentropy": 1.9079387784004211, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22546496242284775, "step": 6446 }, { "epoch": 0.12896, "grad_norm": 2.140625, "grad_norm_var": 0.00390625, "learning_rate": 0.0001, "loss": 4.4188, "loss/crossentropy": 1.9344156980514526, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21923892199993134, "step": 6448 }, { "epoch": 0.129, "grad_norm": 2.296875, "grad_norm_var": 0.0047271728515625, "learning_rate": 0.0001, "loss": 4.3219, "loss/crossentropy": 1.7627189755439758, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20726975798606873, "step": 6450 }, { "epoch": 0.12904, "grad_norm": 2.28125, "grad_norm_var": 0.0048736572265625, "learning_rate": 0.0001, "loss": 4.4045, "loss/crossentropy": 1.9988782405853271, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23156572133302689, "step": 6452 }, { "epoch": 0.12908, "grad_norm": 2.1875, "grad_norm_var": 0.004621378580729167, "learning_rate": 0.0001, "loss": 4.6551, "loss/crossentropy": 2.3970296382904053, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24723558872938156, "step": 6454 }, { "epoch": 0.12912, "grad_norm": 2.234375, "grad_norm_var": 0.00455322265625, "learning_rate": 0.0001, "loss": 4.3785, "loss/crossentropy": 2.009088099002838, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2188432812690735, "step": 6456 }, { "epoch": 0.12916, "grad_norm": 2.203125, "grad_norm_var": 0.005882771809895834, "learning_rate": 0.0001, "loss": 4.9189, "loss/crossentropy": 2.166573464870453, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22506655752658844, "step": 6458 }, { "epoch": 0.1292, "grad_norm": 2.109375, "grad_norm_var": 0.00670166015625, "learning_rate": 0.0001, "loss": 4.3281, "loss/crossentropy": 2.1567386388778687, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23143760859966278, "step": 6460 }, { "epoch": 0.12924, "grad_norm": 2.203125, "grad_norm_var": 0.007307942708333333, "learning_rate": 0.0001, "loss": 3.9726, "loss/crossentropy": 1.8458901643753052, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22627578675746918, "step": 6462 }, { "epoch": 0.12928, "grad_norm": 2.21875, "grad_norm_var": 0.00738525390625, "learning_rate": 0.0001, "loss": 4.1416, "loss/crossentropy": 1.7933887243270874, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1944974958896637, "step": 6464 }, { "epoch": 0.12932, "grad_norm": 2.328125, "grad_norm_var": 0.0077707926432291664, "learning_rate": 0.0001, "loss": 4.3712, "loss/crossentropy": 2.14457631111145, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2461908757686615, "step": 6466 }, { "epoch": 0.12936, "grad_norm": 1.9921875, "grad_norm_var": 0.011563873291015625, "learning_rate": 0.0001, "loss": 4.2253, "loss/crossentropy": 2.2191081047058105, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24845656007528305, "step": 6468 }, { "epoch": 0.1294, "grad_norm": 2.3125, "grad_norm_var": 0.012284088134765624, "learning_rate": 0.0001, "loss": 4.3992, "loss/crossentropy": 2.2655181884765625, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24961909651756287, "step": 6470 }, { "epoch": 0.12944, "grad_norm": 2.171875, "grad_norm_var": 0.012617746988932291, "learning_rate": 0.0001, "loss": 4.2431, "loss/crossentropy": 1.9992872476577759, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21180924773216248, "step": 6472 }, { "epoch": 0.12948, "grad_norm": 2.078125, "grad_norm_var": 0.009348297119140625, "learning_rate": 0.0001, "loss": 4.2865, "loss/crossentropy": 2.1950103044509888, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23401658236980438, "step": 6474 }, { "epoch": 0.12952, "grad_norm": 2.234375, "grad_norm_var": 0.010027821858723958, "learning_rate": 0.0001, "loss": 4.2614, "loss/crossentropy": 2.148743689060211, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2146785706281662, "step": 6476 }, { "epoch": 0.12956, "grad_norm": 2.140625, "grad_norm_var": 0.010253651936848959, "learning_rate": 0.0001, "loss": 4.7196, "loss/crossentropy": 2.4312193393707275, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2596089243888855, "step": 6478 }, { "epoch": 0.1296, "grad_norm": 2.25, "grad_norm_var": 0.011844635009765625, "learning_rate": 0.0001, "loss": 4.7232, "loss/crossentropy": 2.061104893684387, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24564718455076218, "step": 6480 }, { "epoch": 0.12964, "grad_norm": 2.21875, "grad_norm_var": 0.011224110921223959, "learning_rate": 0.0001, "loss": 4.6833, "loss/crossentropy": 2.1994687914848328, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2352369725704193, "step": 6482 }, { "epoch": 0.12968, "grad_norm": 2.203125, "grad_norm_var": 0.010692342122395834, "learning_rate": 0.0001, "loss": 4.2378, "loss/crossentropy": 2.085163116455078, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2309635877609253, "step": 6484 }, { "epoch": 0.12972, "grad_norm": 2.1875, "grad_norm_var": 0.0098297119140625, "learning_rate": 0.0001, "loss": 4.5964, "loss/crossentropy": 2.3360198736190796, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24846713244915009, "step": 6486 }, { "epoch": 0.12976, "grad_norm": 2.265625, "grad_norm_var": 0.010602823893229167, "learning_rate": 0.0001, "loss": 4.4454, "loss/crossentropy": 2.089003086090088, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24980145692825317, "step": 6488 }, { "epoch": 0.1298, "grad_norm": 2.125, "grad_norm_var": 0.0108795166015625, "learning_rate": 0.0001, "loss": 4.4446, "loss/crossentropy": 2.0884299874305725, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23875930905342102, "step": 6490 }, { "epoch": 0.12984, "grad_norm": 2.09375, "grad_norm_var": 0.01002197265625, "learning_rate": 0.0001, "loss": 4.1661, "loss/crossentropy": 1.9070702195167542, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21942836046218872, "step": 6492 }, { "epoch": 0.12988, "grad_norm": 2.609375, "grad_norm_var": 0.02047119140625, "learning_rate": 0.0001, "loss": 4.6558, "loss/crossentropy": 2.1617711782455444, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22820374369621277, "step": 6494 }, { "epoch": 0.12992, "grad_norm": 2.15625, "grad_norm_var": 0.0203765869140625, "learning_rate": 0.0001, "loss": 4.0938, "loss/crossentropy": 1.7782898545265198, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22162485867738724, "step": 6496 }, { "epoch": 0.12996, "grad_norm": 2.1875, "grad_norm_var": 0.021028645833333335, "learning_rate": 0.0001, "loss": 4.4198, "loss/crossentropy": 2.1364612579345703, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22490044683218002, "step": 6498 }, { "epoch": 0.13, "grad_norm": 2.34375, "grad_norm_var": 0.020213826497395834, "learning_rate": 0.0001, "loss": 4.569, "loss/crossentropy": 2.282773971557617, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24553906172513962, "step": 6500 }, { "epoch": 0.13004, "grad_norm": 2.171875, "grad_norm_var": 0.020905558268229166, "learning_rate": 0.0001, "loss": 4.4965, "loss/crossentropy": 2.008001983165741, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22152648121118546, "step": 6502 }, { "epoch": 0.13008, "grad_norm": 2.328125, "grad_norm_var": 0.021833292643229165, "learning_rate": 0.0001, "loss": 4.4535, "loss/crossentropy": 2.1681981086730957, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26258186995983124, "step": 6504 }, { "epoch": 0.13012, "grad_norm": 2.328125, "grad_norm_var": 0.021761067708333335, "learning_rate": 0.0001, "loss": 4.6254, "loss/crossentropy": 2.5754435062408447, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26315446197986603, "step": 6506 }, { "epoch": 0.13016, "grad_norm": 2.234375, "grad_norm_var": 0.021187337239583333, "learning_rate": 0.0001, "loss": 4.1505, "loss/crossentropy": 1.7897658348083496, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22884277999401093, "step": 6508 }, { "epoch": 0.1302, "grad_norm": 2.46875, "grad_norm_var": 0.016844685872395834, "learning_rate": 0.0001, "loss": 4.6315, "loss/crossentropy": 1.8081435561180115, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21860769391059875, "step": 6510 }, { "epoch": 0.13024, "grad_norm": 2.359375, "grad_norm_var": 0.0140289306640625, "learning_rate": 0.0001, "loss": 4.7371, "loss/crossentropy": 2.243759036064148, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2545628473162651, "step": 6512 }, { "epoch": 0.13028, "grad_norm": 2.1875, "grad_norm_var": 0.03447265625, "learning_rate": 0.0001, "loss": 4.6147, "loss/crossentropy": 2.069986939430237, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20862630754709244, "step": 6514 }, { "epoch": 0.13032, "grad_norm": 2.203125, "grad_norm_var": 0.0349761962890625, "learning_rate": 0.0001, "loss": 4.4758, "loss/crossentropy": 2.052341878414154, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2460884153842926, "step": 6516 }, { "epoch": 0.13036, "grad_norm": 2.390625, "grad_norm_var": 0.0317047119140625, "learning_rate": 0.0001, "loss": 4.6369, "loss/crossentropy": 2.3376708030700684, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25382688641548157, "step": 6518 }, { "epoch": 0.1304, "grad_norm": 2.125, "grad_norm_var": 0.0329498291015625, "learning_rate": 0.0001, "loss": 4.3048, "loss/crossentropy": 1.8329171538352966, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2135135680437088, "step": 6520 }, { "epoch": 0.13044, "grad_norm": 2.421875, "grad_norm_var": 0.035791015625, "learning_rate": 0.0001, "loss": 4.6582, "loss/crossentropy": 2.077217698097229, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2265045866370201, "step": 6522 }, { "epoch": 0.13048, "grad_norm": 2.171875, "grad_norm_var": 0.036783854166666664, "learning_rate": 0.0001, "loss": 4.2189, "loss/crossentropy": 2.0677568912506104, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2468869537115097, "step": 6524 }, { "epoch": 0.13052, "grad_norm": 2.296875, "grad_norm_var": 0.03591206868489583, "learning_rate": 0.0001, "loss": 4.5246, "loss/crossentropy": 2.0692074298858643, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2602302059531212, "step": 6526 }, { "epoch": 0.13056, "grad_norm": 2.421875, "grad_norm_var": 0.03583984375, "learning_rate": 0.0001, "loss": 4.6709, "loss/crossentropy": 1.9767250418663025, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2232941836118698, "step": 6528 }, { "epoch": 0.1306, "grad_norm": 2.234375, "grad_norm_var": 0.01480712890625, "learning_rate": 0.0001, "loss": 4.4851, "loss/crossentropy": 1.7538996934890747, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20786649733781815, "step": 6530 }, { "epoch": 0.13064, "grad_norm": 2.203125, "grad_norm_var": 0.01578369140625, "learning_rate": 0.0001, "loss": 4.5598, "loss/crossentropy": 2.061249256134033, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23238955438137054, "step": 6532 }, { "epoch": 0.13068, "grad_norm": 2.171875, "grad_norm_var": 0.0167877197265625, "learning_rate": 0.0001, "loss": 4.237, "loss/crossentropy": 1.9083253145217896, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22536182403564453, "step": 6534 }, { "epoch": 0.13072, "grad_norm": 2.046875, "grad_norm_var": 0.017952473958333333, "learning_rate": 0.0001, "loss": 4.3004, "loss/crossentropy": 2.052153766155243, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2343958392739296, "step": 6536 }, { "epoch": 0.13076, "grad_norm": 2.125, "grad_norm_var": 0.010091145833333334, "learning_rate": 0.0001, "loss": 4.3638, "loss/crossentropy": 1.9979270100593567, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20102836191654205, "step": 6538 }, { "epoch": 0.1308, "grad_norm": 1.9921875, "grad_norm_var": 0.012031809488932291, "learning_rate": 0.0001, "loss": 4.019, "loss/crossentropy": 1.618333637714386, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21146921813488007, "step": 6540 }, { "epoch": 0.13084, "grad_norm": 2.078125, "grad_norm_var": 0.011940256754557291, "learning_rate": 0.0001, "loss": 4.2514, "loss/crossentropy": 1.771790623664856, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21661869436502457, "step": 6542 }, { "epoch": 0.13088, "grad_norm": 2.25, "grad_norm_var": 0.007165273030598958, "learning_rate": 0.0001, "loss": 4.3528, "loss/crossentropy": 2.2347733974456787, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2507361173629761, "step": 6544 }, { "epoch": 0.13092, "grad_norm": 2.328125, "grad_norm_var": 0.012389882405598959, "learning_rate": 0.0001, "loss": 4.4333, "loss/crossentropy": 2.226397395133972, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.257301464676857, "step": 6546 }, { "epoch": 0.13096, "grad_norm": 2.125, "grad_norm_var": 0.012776438395182292, "learning_rate": 0.0001, "loss": 4.1629, "loss/crossentropy": 1.9884281158447266, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2213538959622383, "step": 6548 }, { "epoch": 0.131, "grad_norm": 2.0625, "grad_norm_var": 0.01755549112955729, "learning_rate": 0.0001, "loss": 4.549, "loss/crossentropy": 2.0790398120880127, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2259911745786667, "step": 6550 }, { "epoch": 0.13104, "grad_norm": 2.28125, "grad_norm_var": 0.016294097900390624, "learning_rate": 0.0001, "loss": 4.5118, "loss/crossentropy": 2.1309107542037964, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23984672129154205, "step": 6552 }, { "epoch": 0.13108, "grad_norm": 2.296875, "grad_norm_var": 0.01744562784830729, "learning_rate": 0.0001, "loss": 4.4528, "loss/crossentropy": 1.9651963114738464, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21699358522891998, "step": 6554 }, { "epoch": 0.13112, "grad_norm": 2.21875, "grad_norm_var": 0.014598592122395834, "learning_rate": 0.0001, "loss": 4.436, "loss/crossentropy": 2.405247449874878, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24624405801296234, "step": 6556 }, { "epoch": 0.13116, "grad_norm": 2.125, "grad_norm_var": 0.0154296875, "learning_rate": 0.0001, "loss": 4.3984, "loss/crossentropy": 2.215611457824707, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24835016578435898, "step": 6558 }, { "epoch": 0.1312, "grad_norm": 4.125, "grad_norm_var": 0.2436920166015625, "learning_rate": 0.0001, "loss": 4.3876, "loss/crossentropy": 2.1731653809547424, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24633550643920898, "step": 6560 }, { "epoch": 0.13124, "grad_norm": 2.234375, "grad_norm_var": 0.24501953125, "learning_rate": 0.0001, "loss": 4.5101, "loss/crossentropy": 2.0507587790489197, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25702129304409027, "step": 6562 }, { "epoch": 0.13128, "grad_norm": 2.375, "grad_norm_var": 0.2433502197265625, "learning_rate": 0.0001, "loss": 4.3464, "loss/crossentropy": 2.2088446617126465, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22610870003700256, "step": 6564 }, { "epoch": 0.13132, "grad_norm": 2.34375, "grad_norm_var": 0.23931884765625, "learning_rate": 0.0001, "loss": 4.3612, "loss/crossentropy": 1.6911352276802063, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2188590243458748, "step": 6566 }, { "epoch": 0.13136, "grad_norm": 2.09375, "grad_norm_var": 0.24976806640625, "learning_rate": 0.0001, "loss": 3.9795, "loss/crossentropy": 1.807699978351593, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21844930946826935, "step": 6568 }, { "epoch": 0.1314, "grad_norm": 2.28125, "grad_norm_var": 0.24514872233072918, "learning_rate": 0.0001, "loss": 4.4293, "loss/crossentropy": 2.292602300643921, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.242776520550251, "step": 6570 }, { "epoch": 0.13144, "grad_norm": 2.03125, "grad_norm_var": 0.24806315104166668, "learning_rate": 0.0001, "loss": 4.07, "loss/crossentropy": 1.5262329578399658, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1674257293343544, "step": 6572 }, { "epoch": 0.13148, "grad_norm": 2.15625, "grad_norm_var": 0.24504801432291667, "learning_rate": 0.0001, "loss": 4.4084, "loss/crossentropy": 2.180319309234619, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2260192409157753, "step": 6574 }, { "epoch": 0.13152, "grad_norm": 2.265625, "grad_norm_var": 0.0216705322265625, "learning_rate": 0.0001, "loss": 4.544, "loss/crossentropy": 2.188440203666687, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23389383405447006, "step": 6576 }, { "epoch": 0.13156, "grad_norm": 2.125, "grad_norm_var": 0.03427327473958333, "learning_rate": 0.0001, "loss": 4.7595, "loss/crossentropy": 2.20253586769104, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3217846751213074, "step": 6578 }, { "epoch": 0.1316, "grad_norm": 2.359375, "grad_norm_var": 0.034764607747395836, "learning_rate": 0.0001, "loss": 4.2771, "loss/crossentropy": 2.1901479959487915, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24521923065185547, "step": 6580 }, { "epoch": 0.13164, "grad_norm": 2.6875, "grad_norm_var": 0.04170633951822917, "learning_rate": 0.0001, "loss": 4.7105, "loss/crossentropy": 2.0719348192214966, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2640562355518341, "step": 6582 }, { "epoch": 0.13168, "grad_norm": 2.1875, "grad_norm_var": 0.040339152018229164, "learning_rate": 0.0001, "loss": 4.1487, "loss/crossentropy": 2.331762194633484, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24890532344579697, "step": 6584 }, { "epoch": 0.13172, "grad_norm": 2.34375, "grad_norm_var": 0.04429931640625, "learning_rate": 0.0001, "loss": 4.6013, "loss/crossentropy": 1.5143779516220093, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1957392692565918, "step": 6586 }, { "epoch": 0.13176, "grad_norm": 2.171875, "grad_norm_var": 0.04088134765625, "learning_rate": 0.0001, "loss": 4.4311, "loss/crossentropy": 2.301910698413849, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.27353671938180923, "step": 6588 }, { "epoch": 0.1318, "grad_norm": 2.84375, "grad_norm_var": 0.11796773274739583, "learning_rate": 0.0001, "loss": 4.5035, "loss/crossentropy": 2.018579602241516, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24261966347694397, "step": 6590 }, { "epoch": 0.13184, "grad_norm": 2.203125, "grad_norm_var": 0.11685282389322917, "learning_rate": 0.0001, "loss": 4.6931, "loss/crossentropy": 1.9749983549118042, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20727626234292984, "step": 6592 }, { "epoch": 0.13188, "grad_norm": 2.0, "grad_norm_var": 0.11741536458333333, "learning_rate": 0.0001, "loss": 4.1929, "loss/crossentropy": 2.040414035320282, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22879169881343842, "step": 6594 }, { "epoch": 0.13192, "grad_norm": 2.453125, "grad_norm_var": 0.11551106770833333, "learning_rate": 0.0001, "loss": 4.3331, "loss/crossentropy": 1.941766619682312, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22591909021139145, "step": 6596 }, { "epoch": 0.13196, "grad_norm": 2.34375, "grad_norm_var": 0.10946858723958333, "learning_rate": 0.0001, "loss": 4.3659, "loss/crossentropy": 1.8487200140953064, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23654203116893768, "step": 6598 }, { "epoch": 0.132, "grad_norm": 2.109375, "grad_norm_var": 0.10340067545572916, "learning_rate": 0.0001, "loss": 4.225, "loss/crossentropy": 1.9297338724136353, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2016594037413597, "step": 6600 }, { "epoch": 0.13204, "grad_norm": 2.046875, "grad_norm_var": 0.11005757649739584, "learning_rate": 0.0001, "loss": 4.3643, "loss/crossentropy": 2.048487663269043, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20044995844364166, "step": 6602 }, { "epoch": 0.13208, "grad_norm": 2.0625, "grad_norm_var": 0.11295572916666667, "learning_rate": 0.0001, "loss": 4.425, "loss/crossentropy": 2.3620439767837524, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24750277400016785, "step": 6604 }, { "epoch": 0.13212, "grad_norm": 2.328125, "grad_norm_var": 0.0158203125, "learning_rate": 0.0001, "loss": 4.2578, "loss/crossentropy": 2.1324113607406616, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23109012842178345, "step": 6606 }, { "epoch": 0.13216, "grad_norm": 2.1875, "grad_norm_var": 0.015620930989583334, "learning_rate": 0.0001, "loss": 4.4775, "loss/crossentropy": 2.021477997303009, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2152409330010414, "step": 6608 }, { "epoch": 0.1322, "grad_norm": 2.1875, "grad_norm_var": 0.012669881184895834, "learning_rate": 0.0001, "loss": 4.6221, "loss/crossentropy": 2.1693036556243896, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.234656922519207, "step": 6610 }, { "epoch": 0.13224, "grad_norm": 2.203125, "grad_norm_var": 0.008854166666666666, "learning_rate": 0.0001, "loss": 4.1469, "loss/crossentropy": 2.1138017177581787, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2239084094762802, "step": 6612 }, { "epoch": 0.13228, "grad_norm": 2.28125, "grad_norm_var": 0.008772786458333333, "learning_rate": 0.0001, "loss": 4.4862, "loss/crossentropy": 2.2540252208709717, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2614182382822037, "step": 6614 }, { "epoch": 0.13232, "grad_norm": 2.078125, "grad_norm_var": 0.008837890625, "learning_rate": 0.0001, "loss": 4.2131, "loss/crossentropy": 2.033502757549286, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2177339717745781, "step": 6616 }, { "epoch": 0.13236, "grad_norm": 2.078125, "grad_norm_var": 0.010054524739583333, "learning_rate": 0.0001, "loss": 4.2648, "loss/crossentropy": 2.2490646839141846, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22369390726089478, "step": 6618 }, { "epoch": 0.1324, "grad_norm": 2.21875, "grad_norm_var": 0.0150543212890625, "learning_rate": 0.0001, "loss": 4.3571, "loss/crossentropy": 2.14319908618927, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24394190311431885, "step": 6620 }, { "epoch": 0.13244, "grad_norm": 2.390625, "grad_norm_var": 0.01627197265625, "learning_rate": 0.0001, "loss": 4.5136, "loss/crossentropy": 2.045474410057068, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22738997638225555, "step": 6622 }, { "epoch": 0.13248, "grad_norm": 2.203125, "grad_norm_var": 0.015816243489583333, "learning_rate": 0.0001, "loss": 4.5773, "loss/crossentropy": 2.1853290796279907, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2565506473183632, "step": 6624 }, { "epoch": 0.13252, "grad_norm": 2.078125, "grad_norm_var": 0.017015584309895835, "learning_rate": 0.0001, "loss": 4.275, "loss/crossentropy": 2.1161083579063416, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22949891537427902, "step": 6626 }, { "epoch": 0.13256, "grad_norm": 2.1875, "grad_norm_var": 0.0167633056640625, "learning_rate": 0.0001, "loss": 4.3154, "loss/crossentropy": 2.167048454284668, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23087909072637558, "step": 6628 }, { "epoch": 0.1326, "grad_norm": 2.21875, "grad_norm_var": 0.0156158447265625, "learning_rate": 0.0001, "loss": 4.2777, "loss/crossentropy": 2.1544610261917114, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25153525173664093, "step": 6630 }, { "epoch": 0.13264, "grad_norm": 2.484375, "grad_norm_var": 0.018244425455729168, "learning_rate": 0.0001, "loss": 4.4778, "loss/crossentropy": 2.0319228768348694, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22067170590162277, "step": 6632 }, { "epoch": 0.13268, "grad_norm": 2.140625, "grad_norm_var": 0.0162261962890625, "learning_rate": 0.0001, "loss": 4.4398, "loss/crossentropy": 1.9723476767539978, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21380966901779175, "step": 6634 }, { "epoch": 0.13272, "grad_norm": 2.796875, "grad_norm_var": 0.031590779622395836, "learning_rate": 0.0001, "loss": 4.3474, "loss/crossentropy": 2.2833333015441895, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23245477676391602, "step": 6636 }, { "epoch": 0.13276, "grad_norm": 2.28125, "grad_norm_var": 0.03132222493489583, "learning_rate": 0.0001, "loss": 4.4522, "loss/crossentropy": 2.13793683052063, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22665268182754517, "step": 6638 }, { "epoch": 0.1328, "grad_norm": 2.734375, "grad_norm_var": 0.04592692057291667, "learning_rate": 0.0001, "loss": 4.6386, "loss/crossentropy": 2.188693881034851, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2471916377544403, "step": 6640 }, { "epoch": 0.13284, "grad_norm": 2.234375, "grad_norm_var": 0.0438629150390625, "learning_rate": 0.0001, "loss": 4.6394, "loss/crossentropy": 2.169856071472168, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23427975177764893, "step": 6642 }, { "epoch": 0.13288, "grad_norm": 2.15625, "grad_norm_var": 0.04365132649739583, "learning_rate": 0.0001, "loss": 4.6498, "loss/crossentropy": 2.1600061655044556, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23366540670394897, "step": 6644 }, { "epoch": 0.13292, "grad_norm": 2.359375, "grad_norm_var": 0.044188435872395834, "learning_rate": 0.0001, "loss": 4.4521, "loss/crossentropy": 1.822945475578308, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2203340008854866, "step": 6646 }, { "epoch": 0.13296, "grad_norm": 2.15625, "grad_norm_var": 0.045735677083333336, "learning_rate": 0.0001, "loss": 4.3263, "loss/crossentropy": 1.8908653259277344, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2320382297039032, "step": 6648 }, { "epoch": 0.133, "grad_norm": 2.265625, "grad_norm_var": 0.04363606770833333, "learning_rate": 0.0001, "loss": 4.5065, "loss/crossentropy": 2.126000165939331, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24388836324214935, "step": 6650 }, { "epoch": 0.13304, "grad_norm": 2.328125, "grad_norm_var": 0.0242828369140625, "learning_rate": 0.0001, "loss": 4.5787, "loss/crossentropy": 2.434928297996521, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24083568900823593, "step": 6652 }, { "epoch": 0.13308, "grad_norm": 2.203125, "grad_norm_var": 0.023824055989583332, "learning_rate": 0.0001, "loss": 4.5538, "loss/crossentropy": 2.2186567783355713, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2220132276415825, "step": 6654 }, { "epoch": 0.13312, "grad_norm": 2.25, "grad_norm_var": 0.006257120768229167, "learning_rate": 0.0001, "loss": 4.4934, "loss/crossentropy": 1.849799931049347, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21551364660263062, "step": 6656 }, { "epoch": 0.13316, "grad_norm": 2.109375, "grad_norm_var": 0.0072662353515625, "learning_rate": 0.0001, "loss": 4.2237, "loss/crossentropy": 2.082044243812561, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21274320781230927, "step": 6658 }, { "epoch": 0.1332, "grad_norm": 2.125, "grad_norm_var": 0.010838826497395834, "learning_rate": 0.0001, "loss": 4.5884, "loss/crossentropy": 2.1957098245620728, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23078418523073196, "step": 6660 }, { "epoch": 0.13324, "grad_norm": 2.28125, "grad_norm_var": 0.16033528645833334, "learning_rate": 0.0001, "loss": 4.519, "loss/crossentropy": 2.228309690952301, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.27178408950567245, "step": 6662 }, { "epoch": 0.13328, "grad_norm": 2.40625, "grad_norm_var": 0.156298828125, "learning_rate": 0.0001, "loss": 4.5987, "loss/crossentropy": 1.8185940384864807, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2145048901438713, "step": 6664 }, { "epoch": 0.13332, "grad_norm": 2.171875, "grad_norm_var": 0.1566558837890625, "learning_rate": 0.0001, "loss": 4.4722, "loss/crossentropy": 2.198649048805237, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2262621968984604, "step": 6666 }, { "epoch": 0.13336, "grad_norm": 2.15625, "grad_norm_var": 0.159130859375, "learning_rate": 0.0001, "loss": 4.5729, "loss/crossentropy": 2.2075835466384888, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23531068861484528, "step": 6668 }, { "epoch": 0.1334, "grad_norm": 2.140625, "grad_norm_var": 0.16243387858072916, "learning_rate": 0.0001, "loss": 4.2913, "loss/crossentropy": 1.9719768166542053, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2206977903842926, "step": 6670 }, { "epoch": 0.13344, "grad_norm": 2.03125, "grad_norm_var": 0.17021484375, "learning_rate": 0.0001, "loss": 4.2144, "loss/crossentropy": 2.304553985595703, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23418369889259338, "step": 6672 }, { "epoch": 0.13348, "grad_norm": 2.25, "grad_norm_var": 0.1682281494140625, "learning_rate": 0.0001, "loss": 4.4485, "loss/crossentropy": 2.212409734725952, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23888318240642548, "step": 6674 }, { "epoch": 0.13352, "grad_norm": 2.0625, "grad_norm_var": 0.19371337890625, "learning_rate": 0.0001, "loss": 4.176, "loss/crossentropy": 2.001897156238556, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2147163525223732, "step": 6676 }, { "epoch": 0.13356, "grad_norm": 2.203125, "grad_norm_var": 0.043680826822916664, "learning_rate": 0.0001, "loss": 4.4245, "loss/crossentropy": 2.216760039329529, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24913202226161957, "step": 6678 }, { "epoch": 0.1336, "grad_norm": 2.09375, "grad_norm_var": 0.04597142537434896, "learning_rate": 0.0001, "loss": 4.1862, "loss/crossentropy": 1.8190750479698181, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.200186125934124, "step": 6680 }, { "epoch": 0.13364, "grad_norm": 2.140625, "grad_norm_var": 0.047548166910807294, "learning_rate": 0.0001, "loss": 4.7271, "loss/crossentropy": 2.311274528503418, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22400956600904465, "step": 6682 }, { "epoch": 0.13368, "grad_norm": 2.34375, "grad_norm_var": 0.5293841044108073, "learning_rate": 0.0001, "loss": 4.3852, "loss/crossentropy": 2.0381893515586853, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23101551085710526, "step": 6684 }, { "epoch": 0.13372, "grad_norm": 2.21875, "grad_norm_var": 0.5331776936848959, "learning_rate": 0.0001, "loss": 4.1185, "loss/crossentropy": 1.7441503405570984, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20208899676799774, "step": 6686 }, { "epoch": 0.13376, "grad_norm": 2.171875, "grad_norm_var": 0.5252593994140625, "learning_rate": 0.0001, "loss": 4.3835, "loss/crossentropy": 1.8874938488006592, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2255900353193283, "step": 6688 }, { "epoch": 0.1338, "grad_norm": 2.3125, "grad_norm_var": 0.5284006754557292, "learning_rate": 0.0001, "loss": 4.2563, "loss/crossentropy": 2.2768125534057617, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21636968851089478, "step": 6690 }, { "epoch": 0.13384, "grad_norm": 2.09375, "grad_norm_var": 0.5063954671223958, "learning_rate": 0.0001, "loss": 4.278, "loss/crossentropy": 2.0253931283950806, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.28937554359436035, "step": 6692 }, { "epoch": 0.13388, "grad_norm": 2.28125, "grad_norm_var": 0.5079661051432292, "learning_rate": 0.0001, "loss": 4.4205, "loss/crossentropy": 2.1940718293190002, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23384064435958862, "step": 6694 }, { "epoch": 0.13392, "grad_norm": 2.078125, "grad_norm_var": 0.4987993876139323, "learning_rate": 0.0001, "loss": 4.5066, "loss/crossentropy": 2.071534514427185, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2146512120962143, "step": 6696 }, { "epoch": 0.13396, "grad_norm": 2.28125, "grad_norm_var": 0.5017534891764323, "learning_rate": 0.0001, "loss": 4.348, "loss/crossentropy": 1.8530714511871338, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21837462484836578, "step": 6698 }, { "epoch": 0.134, "grad_norm": 2.28125, "grad_norm_var": 0.013396962483723959, "learning_rate": 0.0001, "loss": 4.521, "loss/crossentropy": 2.210664451122284, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2199932038784027, "step": 6700 }, { "epoch": 0.13404, "grad_norm": 2.171875, "grad_norm_var": 0.011506144205729167, "learning_rate": 0.0001, "loss": 4.4226, "loss/crossentropy": 1.8530223965644836, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.209881991147995, "step": 6702 }, { "epoch": 0.13408, "grad_norm": 2.234375, "grad_norm_var": 0.011839803059895833, "learning_rate": 0.0001, "loss": 4.4302, "loss/crossentropy": 1.8609183430671692, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22096765041351318, "step": 6704 }, { "epoch": 0.13412, "grad_norm": 2.59375, "grad_norm_var": 0.021214803059895832, "learning_rate": 0.0001, "loss": 4.8429, "loss/crossentropy": 2.33315110206604, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26414938271045685, "step": 6706 }, { "epoch": 0.13416, "grad_norm": 2.359375, "grad_norm_var": 0.04798075358072917, "learning_rate": 0.0001, "loss": 4.6054, "loss/crossentropy": 2.2656116485595703, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2769838646054268, "step": 6708 }, { "epoch": 0.1342, "grad_norm": 2.1875, "grad_norm_var": 0.0466705322265625, "learning_rate": 0.0001, "loss": 4.4875, "loss/crossentropy": 2.2131590843200684, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23640615493059158, "step": 6710 }, { "epoch": 0.13424, "grad_norm": 2.25, "grad_norm_var": 0.044831339518229166, "learning_rate": 0.0001, "loss": 4.1554, "loss/crossentropy": 1.8667671084403992, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.212738998234272, "step": 6712 }, { "epoch": 0.13428, "grad_norm": 2.203125, "grad_norm_var": 0.04480692545572917, "learning_rate": 0.0001, "loss": 4.0958, "loss/crossentropy": 1.9699830412864685, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23507894575595856, "step": 6714 }, { "epoch": 0.13432, "grad_norm": 2.078125, "grad_norm_var": 0.04632161458333333, "learning_rate": 0.0001, "loss": 4.1571, "loss/crossentropy": 1.8108918070793152, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2192004919052124, "step": 6716 }, { "epoch": 0.13436, "grad_norm": 2.203125, "grad_norm_var": 0.042740885416666666, "learning_rate": 0.0001, "loss": 4.4983, "loss/crossentropy": 2.0528674125671387, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22221814841032028, "step": 6718 }, { "epoch": 0.1344, "grad_norm": 2.046875, "grad_norm_var": 0.046305338541666664, "learning_rate": 0.0001, "loss": 4.2544, "loss/crossentropy": 1.7881956696510315, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21330178529024124, "step": 6720 }, { "epoch": 0.13444, "grad_norm": 2.3125, "grad_norm_var": 0.039159138997395836, "learning_rate": 0.0001, "loss": 4.5424, "loss/crossentropy": 2.2016018629074097, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22042546421289444, "step": 6722 }, { "epoch": 0.13448, "grad_norm": 2.171875, "grad_norm_var": 0.006322224934895833, "learning_rate": 0.0001, "loss": 4.3159, "loss/crossentropy": 1.9661846160888672, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21542846411466599, "step": 6724 }, { "epoch": 0.13452, "grad_norm": 2.328125, "grad_norm_var": 0.031037394205729166, "learning_rate": 0.0001, "loss": 4.4473, "loss/crossentropy": 2.1073482036590576, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23370730876922607, "step": 6726 }, { "epoch": 0.13456, "grad_norm": 2.25, "grad_norm_var": 0.030826822916666666, "learning_rate": 0.0001, "loss": 4.6069, "loss/crossentropy": 2.1937917470932007, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22414422780275345, "step": 6728 }, { "epoch": 0.1346, "grad_norm": 2.109375, "grad_norm_var": 0.030436197916666668, "learning_rate": 0.0001, "loss": 4.2937, "loss/crossentropy": 2.078732967376709, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21227504312992096, "step": 6730 }, { "epoch": 0.13464, "grad_norm": 2.34375, "grad_norm_var": 0.030301920572916665, "learning_rate": 0.0001, "loss": 4.3043, "loss/crossentropy": 1.9002525806427002, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21004119515419006, "step": 6732 }, { "epoch": 0.13468, "grad_norm": 2.15625, "grad_norm_var": 0.030794270833333335, "learning_rate": 0.0001, "loss": 4.5156, "loss/crossentropy": 2.055518925189972, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21610142290592194, "step": 6734 }, { "epoch": 0.13472, "grad_norm": 2.1875, "grad_norm_var": 0.028473917643229166, "learning_rate": 0.0001, "loss": 4.4889, "loss/crossentropy": 2.0521084666252136, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23528365790843964, "step": 6736 }, { "epoch": 0.13476, "grad_norm": 2.203125, "grad_norm_var": 0.02730712890625, "learning_rate": 0.0001, "loss": 4.5051, "loss/crossentropy": 2.2536301612854004, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21544227004051208, "step": 6738 }, { "epoch": 0.1348, "grad_norm": 2.1875, "grad_norm_var": 0.027179972330729166, "learning_rate": 0.0001, "loss": 4.5164, "loss/crossentropy": 2.2610143423080444, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2532905787229538, "step": 6740 }, { "epoch": 0.13484, "grad_norm": 2.109375, "grad_norm_var": 0.00836181640625, "learning_rate": 0.0001, "loss": 4.1301, "loss/crossentropy": 1.8840081095695496, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21514790505170822, "step": 6742 }, { "epoch": 0.13488, "grad_norm": 2.109375, "grad_norm_var": 0.0084136962890625, "learning_rate": 0.0001, "loss": 4.2532, "loss/crossentropy": 2.0841002464294434, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2328692153096199, "step": 6744 }, { "epoch": 0.13492, "grad_norm": 2.390625, "grad_norm_var": 0.010798136393229166, "learning_rate": 0.0001, "loss": 4.6335, "loss/crossentropy": 2.507196068763733, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2736222892999649, "step": 6746 }, { "epoch": 0.13496, "grad_norm": 2.421875, "grad_norm_var": 0.016527303059895835, "learning_rate": 0.0001, "loss": 4.189, "loss/crossentropy": 2.0180357098579407, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22328195720911026, "step": 6748 }, { "epoch": 0.135, "grad_norm": 2.15625, "grad_norm_var": 0.016097005208333334, "learning_rate": 0.0001, "loss": 4.2854, "loss/crossentropy": 2.2457560300827026, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26572495698928833, "step": 6750 }, { "epoch": 0.13504, "grad_norm": 2.6875, "grad_norm_var": 0.031493123372395834, "learning_rate": 0.0001, "loss": 4.885, "loss/crossentropy": 2.1280853748321533, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22716474533081055, "step": 6752 }, { "epoch": 0.13508, "grad_norm": 2.25, "grad_norm_var": 0.03216145833333333, "learning_rate": 0.0001, "loss": 4.3901, "loss/crossentropy": 2.1843650341033936, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24630828201770782, "step": 6754 }, { "epoch": 0.13512, "grad_norm": 2.21875, "grad_norm_var": 0.03390299479166667, "learning_rate": 0.0001, "loss": 4.3584, "loss/crossentropy": 1.7955012917518616, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20614011585712433, "step": 6756 }, { "epoch": 0.13516, "grad_norm": 2.1875, "grad_norm_var": 0.028563435872395834, "learning_rate": 0.0001, "loss": 4.3546, "loss/crossentropy": 1.9315852522850037, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22918011993169785, "step": 6758 }, { "epoch": 0.1352, "grad_norm": 2.140625, "grad_norm_var": 0.028055826822916668, "learning_rate": 0.0001, "loss": 4.2659, "loss/crossentropy": 1.982887327671051, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21635843813419342, "step": 6760 }, { "epoch": 0.13524, "grad_norm": 2.0625, "grad_norm_var": 0.029215494791666668, "learning_rate": 0.0001, "loss": 4.2828, "loss/crossentropy": 2.25021892786026, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22931701689958572, "step": 6762 }, { "epoch": 0.13528, "grad_norm": 2.34375, "grad_norm_var": 0.02252197265625, "learning_rate": 0.0001, "loss": 4.5991, "loss/crossentropy": 2.5220746994018555, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2406606376171112, "step": 6764 }, { "epoch": 0.13532, "grad_norm": 2.375, "grad_norm_var": 0.022386678059895835, "learning_rate": 0.0001, "loss": 4.5143, "loss/crossentropy": 1.8115127086639404, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.220379076898098, "step": 6766 }, { "epoch": 0.13536, "grad_norm": 2.0625, "grad_norm_var": 0.010856119791666667, "learning_rate": 0.0001, "loss": 4.5113, "loss/crossentropy": 1.8998088240623474, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22993376106023788, "step": 6768 }, { "epoch": 0.1354, "grad_norm": 2.09375, "grad_norm_var": 0.01051025390625, "learning_rate": 0.0001, "loss": 4.28, "loss/crossentropy": 2.0183660984039307, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21991200000047684, "step": 6770 }, { "epoch": 0.13544, "grad_norm": 2.296875, "grad_norm_var": 0.009847005208333334, "learning_rate": 0.0001, "loss": 4.6224, "loss/crossentropy": 2.1927448511123657, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2536699026823044, "step": 6772 }, { "epoch": 0.13548, "grad_norm": 2.234375, "grad_norm_var": 0.014404296875, "learning_rate": 0.0001, "loss": 4.196, "loss/crossentropy": 1.92184317111969, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20716708898544312, "step": 6774 }, { "epoch": 0.13552, "grad_norm": 2.296875, "grad_norm_var": 0.016813151041666665, "learning_rate": 0.0001, "loss": 4.7779, "loss/crossentropy": 2.2437468767166138, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26381388306617737, "step": 6776 }, { "epoch": 0.13556, "grad_norm": 2.0625, "grad_norm_var": 0.016722615559895834, "learning_rate": 0.0001, "loss": 4.2907, "loss/crossentropy": 2.087414026260376, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21566492319107056, "step": 6778 }, { "epoch": 0.1356, "grad_norm": 2.140625, "grad_norm_var": 0.015653483072916665, "learning_rate": 0.0001, "loss": 4.4273, "loss/crossentropy": 2.1936367750167847, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22492723166942596, "step": 6780 }, { "epoch": 0.13564, "grad_norm": 2.109375, "grad_norm_var": 0.014046223958333333, "learning_rate": 0.0001, "loss": 4.2992, "loss/crossentropy": 1.7642306685447693, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20992937684059143, "step": 6782 }, { "epoch": 0.13568, "grad_norm": 2.140625, "grad_norm_var": 0.014371744791666667, "learning_rate": 0.0001, "loss": 4.3593, "loss/crossentropy": 2.01781964302063, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22552715986967087, "step": 6784 }, { "epoch": 0.13572, "grad_norm": 2.140625, "grad_norm_var": 0.016402180989583334, "learning_rate": 0.0001, "loss": 4.4708, "loss/crossentropy": 2.0788158774375916, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24181769788265228, "step": 6786 }, { "epoch": 0.13576, "grad_norm": 2.171875, "grad_norm_var": 0.01539306640625, "learning_rate": 0.0001, "loss": 4.2163, "loss/crossentropy": 2.0424017310142517, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21705424785614014, "step": 6788 }, { "epoch": 0.1358, "grad_norm": 2.46875, "grad_norm_var": 0.019025675455729165, "learning_rate": 0.0001, "loss": 4.1182, "loss/crossentropy": 1.6175345182418823, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1899409219622612, "step": 6790 }, { "epoch": 0.13584, "grad_norm": 2.3125, "grad_norm_var": 0.016852823893229167, "learning_rate": 0.0001, "loss": 4.2914, "loss/crossentropy": 2.004386007785797, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22395263612270355, "step": 6792 }, { "epoch": 0.13588, "grad_norm": 2.109375, "grad_norm_var": 0.019245402018229166, "learning_rate": 0.0001, "loss": 4.2182, "loss/crossentropy": 1.9224132895469666, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2326791137456894, "step": 6794 }, { "epoch": 0.13592, "grad_norm": 2.1875, "grad_norm_var": 0.019755045572916668, "learning_rate": 0.0001, "loss": 4.4768, "loss/crossentropy": 1.8331453204154968, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21586360037326813, "step": 6796 }, { "epoch": 0.13596, "grad_norm": 2.1875, "grad_norm_var": 0.0197906494140625, "learning_rate": 0.0001, "loss": 4.3059, "loss/crossentropy": 2.535244107246399, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24744001775979996, "step": 6798 }, { "epoch": 0.136, "grad_norm": 2.25, "grad_norm_var": 0.017704264322916666, "learning_rate": 0.0001, "loss": 4.4444, "loss/crossentropy": 2.0433249473571777, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2502296343445778, "step": 6800 }, { "epoch": 0.13604, "grad_norm": 2.09375, "grad_norm_var": 0.0221588134765625, "learning_rate": 0.0001, "loss": 4.4619, "loss/crossentropy": 2.35608172416687, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2536633685231209, "step": 6802 }, { "epoch": 0.13608, "grad_norm": 2.1875, "grad_norm_var": 0.022362263997395833, "learning_rate": 0.0001, "loss": 4.4493, "loss/crossentropy": 2.1230265498161316, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2276441976428032, "step": 6804 }, { "epoch": 0.13612, "grad_norm": 2.15625, "grad_norm_var": 0.015925089518229168, "learning_rate": 0.0001, "loss": 4.2125, "loss/crossentropy": 2.0186346769332886, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22158686816692352, "step": 6806 }, { "epoch": 0.13616, "grad_norm": 2.28125, "grad_norm_var": 0.0151763916015625, "learning_rate": 0.0001, "loss": 4.6065, "loss/crossentropy": 2.4136343002319336, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22668009996414185, "step": 6808 }, { "epoch": 0.1362, "grad_norm": 2.109375, "grad_norm_var": 0.012495930989583333, "learning_rate": 0.0001, "loss": 4.3372, "loss/crossentropy": 2.1241788268089294, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23394256830215454, "step": 6810 }, { "epoch": 0.13624, "grad_norm": 2.4375, "grad_norm_var": 0.017145792643229168, "learning_rate": 0.0001, "loss": 4.6063, "loss/crossentropy": 1.9051874279975891, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2560805454850197, "step": 6812 }, { "epoch": 0.13628, "grad_norm": 2.234375, "grad_norm_var": 0.015458170572916667, "learning_rate": 0.0001, "loss": 4.2419, "loss/crossentropy": 1.9248363375663757, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22457116842269897, "step": 6814 }, { "epoch": 0.13632, "grad_norm": 1.921875, "grad_norm_var": 0.022264607747395835, "learning_rate": 0.0001, "loss": 4.5953, "loss/crossentropy": 2.3065048456192017, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.245137557387352, "step": 6816 }, { "epoch": 0.13636, "grad_norm": 2.234375, "grad_norm_var": 0.0152496337890625, "learning_rate": 0.0001, "loss": 4.6096, "loss/crossentropy": 2.152611255645752, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22794383764266968, "step": 6818 }, { "epoch": 0.1364, "grad_norm": 2.09375, "grad_norm_var": 0.016242472330729167, "learning_rate": 0.0001, "loss": 4.129, "loss/crossentropy": 1.9548735618591309, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20764236897230148, "step": 6820 }, { "epoch": 0.13644, "grad_norm": 2.1875, "grad_norm_var": 0.016136678059895833, "learning_rate": 0.0001, "loss": 4.3501, "loss/crossentropy": 2.11979341506958, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22793393582105637, "step": 6822 }, { "epoch": 0.13648, "grad_norm": 2.15625, "grad_norm_var": 0.016748046875, "learning_rate": 0.0001, "loss": 4.3338, "loss/crossentropy": 2.351949691772461, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2366446554660797, "step": 6824 }, { "epoch": 0.13652, "grad_norm": 2.0625, "grad_norm_var": 0.018843587239583334, "learning_rate": 0.0001, "loss": 4.3764, "loss/crossentropy": 2.1197460889816284, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24759536981582642, "step": 6826 }, { "epoch": 0.13656, "grad_norm": 2.0625, "grad_norm_var": 0.013264973958333334, "learning_rate": 0.0001, "loss": 4.1204, "loss/crossentropy": 1.8491687178611755, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20777105540037155, "step": 6828 }, { "epoch": 0.1366, "grad_norm": 2.25, "grad_norm_var": 0.013459269205729167, "learning_rate": 0.0001, "loss": 4.4867, "loss/crossentropy": 1.964136004447937, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2368694394826889, "step": 6830 }, { "epoch": 0.13664, "grad_norm": 2.140625, "grad_norm_var": 0.006859334309895834, "learning_rate": 0.0001, "loss": 4.469, "loss/crossentropy": 1.8988104462623596, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19548720866441727, "step": 6832 }, { "epoch": 0.13668, "grad_norm": 2.125, "grad_norm_var": 0.0059855143229166664, "learning_rate": 0.0001, "loss": 4.3104, "loss/crossentropy": 1.757002353668213, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21120281517505646, "step": 6834 }, { "epoch": 0.13672, "grad_norm": 2.109375, "grad_norm_var": 0.005952962239583333, "learning_rate": 0.0001, "loss": 4.4239, "loss/crossentropy": 1.9414420127868652, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22160177677869797, "step": 6836 }, { "epoch": 0.13676, "grad_norm": 2.0625, "grad_norm_var": 0.0072174072265625, "learning_rate": 0.0001, "loss": 4.4294, "loss/crossentropy": 2.280028223991394, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24696747958660126, "step": 6838 }, { "epoch": 0.1368, "grad_norm": 2.203125, "grad_norm_var": 0.006787109375, "learning_rate": 0.0001, "loss": 4.523, "loss/crossentropy": 2.106986403465271, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24022039771080017, "step": 6840 }, { "epoch": 0.13684, "grad_norm": 2.15625, "grad_norm_var": 0.0349517822265625, "learning_rate": 0.0001, "loss": 4.2609, "loss/crossentropy": 1.9540700912475586, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1968660056591034, "step": 6842 }, { "epoch": 0.13688, "grad_norm": 2.0625, "grad_norm_var": 0.03394775390625, "learning_rate": 0.0001, "loss": 4.2181, "loss/crossentropy": 1.6771780252456665, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20230162143707275, "step": 6844 }, { "epoch": 0.13692, "grad_norm": 2.5625, "grad_norm_var": 0.04419657389322917, "learning_rate": 0.0001, "loss": 4.7567, "loss/crossentropy": 2.059873402118683, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.276339128613472, "step": 6846 }, { "epoch": 0.13696, "grad_norm": 2.0625, "grad_norm_var": 0.04684244791666667, "learning_rate": 0.0001, "loss": 4.2512, "loss/crossentropy": 1.7943353056907654, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1893974468111992, "step": 6848 }, { "epoch": 0.137, "grad_norm": 2.265625, "grad_norm_var": 0.046223958333333336, "learning_rate": 0.0001, "loss": 4.687, "loss/crossentropy": 2.314136028289795, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24898302555084229, "step": 6850 }, { "epoch": 0.13704, "grad_norm": 2.328125, "grad_norm_var": 0.04383036295572917, "learning_rate": 0.0001, "loss": 4.4257, "loss/crossentropy": 2.0062466263771057, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23169535398483276, "step": 6852 }, { "epoch": 0.13708, "grad_norm": 2.09375, "grad_norm_var": 0.044873046875, "learning_rate": 0.0001, "loss": 4.5787, "loss/crossentropy": 2.3600821495056152, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25484780967235565, "step": 6854 }, { "epoch": 0.13712, "grad_norm": 2.28125, "grad_norm_var": 0.043675740559895836, "learning_rate": 0.0001, "loss": 4.2113, "loss/crossentropy": 1.885023295879364, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.255520723760128, "step": 6856 }, { "epoch": 0.13716, "grad_norm": 2.234375, "grad_norm_var": 0.019806925455729166, "learning_rate": 0.0001, "loss": 4.6493, "loss/crossentropy": 2.2864162921905518, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22469403594732285, "step": 6858 }, { "epoch": 0.1372, "grad_norm": 2.09375, "grad_norm_var": 0.018961588541666668, "learning_rate": 0.0001, "loss": 4.4017, "loss/crossentropy": 1.908643126487732, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22016740590333939, "step": 6860 }, { "epoch": 0.13724, "grad_norm": 2.3125, "grad_norm_var": 0.010282389322916667, "learning_rate": 0.0001, "loss": 4.7255, "loss/crossentropy": 2.1028786301612854, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23632919788360596, "step": 6862 }, { "epoch": 0.13728, "grad_norm": 2.546875, "grad_norm_var": 0.01539306640625, "learning_rate": 0.0001, "loss": 4.4043, "loss/crossentropy": 2.0363592505455017, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22131157666444778, "step": 6864 }, { "epoch": 0.13732, "grad_norm": 2.171875, "grad_norm_var": 0.0158843994140625, "learning_rate": 0.0001, "loss": 4.4357, "loss/crossentropy": 2.030495524406433, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.233994759619236, "step": 6866 }, { "epoch": 0.13736, "grad_norm": 2.046875, "grad_norm_var": 0.018033854166666665, "learning_rate": 0.0001, "loss": 4.3036, "loss/crossentropy": 1.6365603804588318, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1909056007862091, "step": 6868 }, { "epoch": 0.1374, "grad_norm": 2.0625, "grad_norm_var": 0.015217081705729166, "learning_rate": 0.0001, "loss": 4.655, "loss/crossentropy": 2.205111026763916, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2267644703388214, "step": 6870 }, { "epoch": 0.13744, "grad_norm": 2.0625, "grad_norm_var": 0.016437784830729166, "learning_rate": 0.0001, "loss": 4.4207, "loss/crossentropy": 2.0179646015167236, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2307143434882164, "step": 6872 }, { "epoch": 0.13748, "grad_norm": 2.09375, "grad_norm_var": 0.018290201822916668, "learning_rate": 0.0001, "loss": 4.2154, "loss/crossentropy": 1.9697463512420654, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2207925096154213, "step": 6874 }, { "epoch": 0.13752, "grad_norm": 2.265625, "grad_norm_var": 0.019189453125, "learning_rate": 0.0001, "loss": 4.5691, "loss/crossentropy": 2.186875820159912, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25762687623500824, "step": 6876 }, { "epoch": 0.13756, "grad_norm": 2.109375, "grad_norm_var": 0.017867024739583334, "learning_rate": 0.0001, "loss": 4.2713, "loss/crossentropy": 2.3203837871551514, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23470903187990189, "step": 6878 }, { "epoch": 0.1376, "grad_norm": 2.1875, "grad_norm_var": 0.0066640218098958336, "learning_rate": 0.0001, "loss": 4.4154, "loss/crossentropy": 2.2642472982406616, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2454553246498108, "step": 6880 }, { "epoch": 0.13764, "grad_norm": 2.1875, "grad_norm_var": 0.008687337239583334, "learning_rate": 0.0001, "loss": 4.4076, "loss/crossentropy": 1.9313859343528748, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21797804534435272, "step": 6882 }, { "epoch": 0.13768, "grad_norm": 3.4375, "grad_norm_var": 0.11204020182291667, "learning_rate": 0.0001, "loss": 4.7846, "loss/crossentropy": 2.5469977855682373, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2651800066232681, "step": 6884 }, { "epoch": 0.13772, "grad_norm": 2.296875, "grad_norm_var": 0.11030171712239584, "learning_rate": 0.0001, "loss": 4.4893, "loss/crossentropy": 2.549328088760376, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2566085457801819, "step": 6886 }, { "epoch": 0.13776, "grad_norm": 2.03125, "grad_norm_var": 0.1126617431640625, "learning_rate": 0.0001, "loss": 4.4927, "loss/crossentropy": 2.3094369769096375, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24709751456975937, "step": 6888 }, { "epoch": 0.1378, "grad_norm": 2.109375, "grad_norm_var": 0.1097808837890625, "learning_rate": 0.0001, "loss": 4.2412, "loss/crossentropy": 1.5071046948432922, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19776180386543274, "step": 6890 }, { "epoch": 0.13784, "grad_norm": 2.078125, "grad_norm_var": 0.1101226806640625, "learning_rate": 0.0001, "loss": 4.3705, "loss/crossentropy": 2.0064221620559692, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20836234837770462, "step": 6892 }, { "epoch": 0.13788, "grad_norm": 2.078125, "grad_norm_var": 0.10816650390625, "learning_rate": 0.0001, "loss": 4.3608, "loss/crossentropy": 2.1216301321983337, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22151901572942734, "step": 6894 }, { "epoch": 0.13792, "grad_norm": 2.25, "grad_norm_var": 0.1073150634765625, "learning_rate": 0.0001, "loss": 4.4168, "loss/crossentropy": 1.8417679071426392, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21999332308769226, "step": 6896 }, { "epoch": 0.13796, "grad_norm": 2.328125, "grad_norm_var": 0.10695699055989584, "learning_rate": 0.0001, "loss": 4.7005, "loss/crossentropy": 2.4651769399642944, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.27029043436050415, "step": 6898 }, { "epoch": 0.138, "grad_norm": 2.21875, "grad_norm_var": 0.014188639322916667, "learning_rate": 0.0001, "loss": 4.2985, "loss/crossentropy": 1.7225988507270813, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20930497348308563, "step": 6900 }, { "epoch": 0.13804, "grad_norm": 2.34375, "grad_norm_var": 0.015360514322916666, "learning_rate": 0.0001, "loss": 4.1156, "loss/crossentropy": 2.1218297481536865, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21276423335075378, "step": 6902 }, { "epoch": 0.13808, "grad_norm": 2.25, "grad_norm_var": 0.010856119791666667, "learning_rate": 0.0001, "loss": 4.2706, "loss/crossentropy": 2.040019452571869, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2438269630074501, "step": 6904 }, { "epoch": 0.13812, "grad_norm": 15.8125, "grad_norm_var": 11.600536092122395, "learning_rate": 0.0001, "loss": 4.5041, "loss/crossentropy": 1.8229625225067139, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22197778522968292, "step": 6906 }, { "epoch": 0.13816, "grad_norm": 2.25, "grad_norm_var": 11.543973795572917, "learning_rate": 0.0001, "loss": 4.7087, "loss/crossentropy": 2.453408360481262, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24046239256858826, "step": 6908 }, { "epoch": 0.1382, "grad_norm": 2.046875, "grad_norm_var": 11.55152587890625, "learning_rate": 0.0001, "loss": 4.4806, "loss/crossentropy": 2.2724320888519287, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23739346861839294, "step": 6910 }, { "epoch": 0.13824, "grad_norm": 2.125, "grad_norm_var": 11.562272135416666, "learning_rate": 0.0001, "loss": 4.2747, "loss/crossentropy": 2.2382686138153076, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22692064195871353, "step": 6912 }, { "epoch": 0.13828, "grad_norm": 2.3125, "grad_norm_var": 11.54869384765625, "learning_rate": 0.0001, "loss": 4.6867, "loss/crossentropy": 2.021562337875366, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24746088683605194, "step": 6914 }, { "epoch": 0.13832, "grad_norm": 2.21875, "grad_norm_var": 11.546556599934895, "learning_rate": 0.0001, "loss": 4.3247, "loss/crossentropy": 2.1071943044662476, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2144002616405487, "step": 6916 }, { "epoch": 0.13836, "grad_norm": 2.28125, "grad_norm_var": 11.550846354166667, "learning_rate": 0.0001, "loss": 4.2686, "loss/crossentropy": 1.9641517400741577, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22584721446037292, "step": 6918 }, { "epoch": 0.1384, "grad_norm": 2.078125, "grad_norm_var": 11.559130859375, "learning_rate": 0.0001, "loss": 4.3194, "loss/crossentropy": 2.4430564641952515, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24351171404123306, "step": 6920 }, { "epoch": 0.13844, "grad_norm": 2.125, "grad_norm_var": 0.022484334309895833, "learning_rate": 0.0001, "loss": 4.4202, "loss/crossentropy": 2.2237725257873535, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2248261496424675, "step": 6922 }, { "epoch": 0.13848, "grad_norm": 2.09375, "grad_norm_var": 0.015412394205729167, "learning_rate": 0.0001, "loss": 4.2028, "loss/crossentropy": 1.7291913628578186, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19805100560188293, "step": 6924 }, { "epoch": 0.13852, "grad_norm": 2.15625, "grad_norm_var": 0.013923136393229167, "learning_rate": 0.0001, "loss": 4.3972, "loss/crossentropy": 1.807108223438263, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20386626571416855, "step": 6926 }, { "epoch": 0.13856, "grad_norm": 2.28125, "grad_norm_var": 0.01422119140625, "learning_rate": 0.0001, "loss": 4.5188, "loss/crossentropy": 2.510676622390747, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24666880816221237, "step": 6928 }, { "epoch": 0.1386, "grad_norm": 2.015625, "grad_norm_var": 0.007515462239583334, "learning_rate": 0.0001, "loss": 4.2006, "loss/crossentropy": 1.9420115947723389, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2214776575565338, "step": 6930 }, { "epoch": 0.13864, "grad_norm": 2.3125, "grad_norm_var": 0.010837554931640625, "learning_rate": 0.0001, "loss": 4.4445, "loss/crossentropy": 2.2288190722465515, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23719585686922073, "step": 6932 }, { "epoch": 0.13868, "grad_norm": 2.09375, "grad_norm_var": 0.011043294270833334, "learning_rate": 0.0001, "loss": 4.071, "loss/crossentropy": 2.04274183511734, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21653369069099426, "step": 6934 }, { "epoch": 0.13872, "grad_norm": 2.296875, "grad_norm_var": 0.0121002197265625, "learning_rate": 0.0001, "loss": 4.4041, "loss/crossentropy": 1.9149779081344604, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19941364973783493, "step": 6936 }, { "epoch": 0.13876, "grad_norm": 2.15625, "grad_norm_var": 0.011554972330729166, "learning_rate": 0.0001, "loss": 4.2577, "loss/crossentropy": 1.7983179092407227, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18979590386152267, "step": 6938 }, { "epoch": 0.1388, "grad_norm": 2.296875, "grad_norm_var": 0.023164876302083335, "learning_rate": 0.0001, "loss": 4.3314, "loss/crossentropy": 2.1919915080070496, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22652295976877213, "step": 6940 }, { "epoch": 0.13884, "grad_norm": 2.21875, "grad_norm_var": 0.023152669270833332, "learning_rate": 0.0001, "loss": 4.494, "loss/crossentropy": 2.0362821221351624, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24296525120735168, "step": 6942 }, { "epoch": 0.13888, "grad_norm": 2.21875, "grad_norm_var": 0.023653157552083335, "learning_rate": 0.0001, "loss": 4.4135, "loss/crossentropy": 2.0371538400650024, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2217850610613823, "step": 6944 }, { "epoch": 0.13892, "grad_norm": 2.5, "grad_norm_var": 0.0268310546875, "learning_rate": 0.0001, "loss": 4.3371, "loss/crossentropy": 1.9137988686561584, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2218162938952446, "step": 6946 }, { "epoch": 0.13896, "grad_norm": 2.46875, "grad_norm_var": 0.03438898722330729, "learning_rate": 0.0001, "loss": 4.6521, "loss/crossentropy": 2.3215843439102173, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2576482892036438, "step": 6948 }, { "epoch": 0.139, "grad_norm": 2.171875, "grad_norm_var": 0.026590983072916668, "learning_rate": 0.0001, "loss": 4.6004, "loss/crossentropy": 2.169154405593872, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24629274010658264, "step": 6950 }, { "epoch": 0.13904, "grad_norm": 2.203125, "grad_norm_var": 0.03173421223958333, "learning_rate": 0.0001, "loss": 4.3549, "loss/crossentropy": 2.1355135440826416, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2316850870847702, "step": 6952 }, { "epoch": 0.13908, "grad_norm": 2.140625, "grad_norm_var": 0.03183186848958333, "learning_rate": 0.0001, "loss": 4.3462, "loss/crossentropy": 2.264985144138336, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23467915505170822, "step": 6954 }, { "epoch": 0.13912, "grad_norm": 2.203125, "grad_norm_var": 0.025926717122395835, "learning_rate": 0.0001, "loss": 4.6559, "loss/crossentropy": 2.1007654666900635, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2376948595046997, "step": 6956 }, { "epoch": 0.13916, "grad_norm": 2.28125, "grad_norm_var": 0.02603759765625, "learning_rate": 0.0001, "loss": 4.4871, "loss/crossentropy": 2.284608840942383, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24811238050460815, "step": 6958 }, { "epoch": 0.1392, "grad_norm": 2.125, "grad_norm_var": 0.025911458333333335, "learning_rate": 0.0001, "loss": 4.2338, "loss/crossentropy": 1.657732367515564, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19541934877634048, "step": 6960 }, { "epoch": 0.13924, "grad_norm": 2.171875, "grad_norm_var": 0.022391764322916667, "learning_rate": 0.0001, "loss": 4.3832, "loss/crossentropy": 1.9607325792312622, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2269211858510971, "step": 6962 }, { "epoch": 0.13928, "grad_norm": 2.09375, "grad_norm_var": 0.006571451822916667, "learning_rate": 0.0001, "loss": 4.3759, "loss/crossentropy": 1.7454752326011658, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21780573576688766, "step": 6964 }, { "epoch": 0.13932, "grad_norm": 2.421875, "grad_norm_var": 0.010350545247395834, "learning_rate": 0.0001, "loss": 4.7664, "loss/crossentropy": 2.001866638660431, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24929454922676086, "step": 6966 }, { "epoch": 0.13936, "grad_norm": 2.21875, "grad_norm_var": 0.008006795247395834, "learning_rate": 0.0001, "loss": 4.4181, "loss/crossentropy": 1.9167855978012085, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22114858031272888, "step": 6968 }, { "epoch": 0.1394, "grad_norm": 2.15625, "grad_norm_var": 0.008003743489583333, "learning_rate": 0.0001, "loss": 4.2284, "loss/crossentropy": 2.0324739813804626, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23271320760250092, "step": 6970 }, { "epoch": 0.13944, "grad_norm": 2.25, "grad_norm_var": 0.016307576497395834, "learning_rate": 0.0001, "loss": 4.5375, "loss/crossentropy": 2.162013590335846, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22832238674163818, "step": 6972 }, { "epoch": 0.13948, "grad_norm": 2.203125, "grad_norm_var": 0.018944295247395833, "learning_rate": 0.0001, "loss": 4.1406, "loss/crossentropy": 2.074672818183899, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2402098923921585, "step": 6974 }, { "epoch": 0.13952, "grad_norm": 2.0625, "grad_norm_var": 0.019758097330729165, "learning_rate": 0.0001, "loss": 4.4221, "loss/crossentropy": 1.9982299208641052, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21126113086938858, "step": 6976 }, { "epoch": 0.13956, "grad_norm": 2.171875, "grad_norm_var": 0.021751912434895833, "learning_rate": 0.0001, "loss": 4.3391, "loss/crossentropy": 1.944049894809723, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2296794354915619, "step": 6978 }, { "epoch": 0.1396, "grad_norm": 2.21875, "grad_norm_var": 0.02047119140625, "learning_rate": 0.0001, "loss": 4.4344, "loss/crossentropy": 2.308506488800049, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2589666247367859, "step": 6980 }, { "epoch": 0.13964, "grad_norm": 2.296875, "grad_norm_var": 0.017724609375, "learning_rate": 0.0001, "loss": 4.2867, "loss/crossentropy": 2.129163682460785, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21905581653118134, "step": 6982 }, { "epoch": 0.13968, "grad_norm": 2.234375, "grad_norm_var": 0.017902628580729166, "learning_rate": 0.0001, "loss": 4.3023, "loss/crossentropy": 1.8560669422149658, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2119361013174057, "step": 6984 }, { "epoch": 0.13972, "grad_norm": 2.140625, "grad_norm_var": 0.018602498372395835, "learning_rate": 0.0001, "loss": 4.1212, "loss/crossentropy": 1.8194095492362976, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20872193574905396, "step": 6986 }, { "epoch": 0.13976, "grad_norm": 2.15625, "grad_norm_var": 0.005501302083333334, "learning_rate": 0.0001, "loss": 4.2612, "loss/crossentropy": 2.0200153589248657, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20160746574401855, "step": 6988 }, { "epoch": 0.1398, "grad_norm": 2.359375, "grad_norm_var": 0.007298787434895833, "learning_rate": 0.0001, "loss": 4.2757, "loss/crossentropy": 1.982479751110077, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23396103084087372, "step": 6990 }, { "epoch": 0.13984, "grad_norm": 2.171875, "grad_norm_var": 0.006494140625, "learning_rate": 0.0001, "loss": 4.5075, "loss/crossentropy": 2.17054283618927, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23827539384365082, "step": 6992 }, { "epoch": 0.13988, "grad_norm": 2.171875, "grad_norm_var": 0.0056640625, "learning_rate": 0.0001, "loss": 4.1794, "loss/crossentropy": 1.619499921798706, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1917721927165985, "step": 6994 }, { "epoch": 0.13992, "grad_norm": 2.15625, "grad_norm_var": 0.005353800455729167, "learning_rate": 0.0001, "loss": 4.3833, "loss/crossentropy": 2.1082500219345093, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23768241703510284, "step": 6996 }, { "epoch": 0.13996, "grad_norm": 2.1875, "grad_norm_var": 0.004076131184895833, "learning_rate": 0.0001, "loss": 4.6731, "loss/crossentropy": 1.8480825424194336, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21046122163534164, "step": 6998 }, { "epoch": 0.14, "grad_norm": 2.4375, "grad_norm_var": 0.008463541666666666, "learning_rate": 0.0001, "loss": 4.5285, "loss/crossentropy": 2.0547631978988647, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22980494797229767, "step": 7000 }, { "epoch": 0.14004, "grad_norm": 2.15625, "grad_norm_var": 0.007835896809895833, "learning_rate": 0.0001, "loss": 4.4625, "loss/crossentropy": 2.0695141553878784, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2100546732544899, "step": 7002 }, { "epoch": 0.14008, "grad_norm": 1.9921875, "grad_norm_var": 0.012277984619140625, "learning_rate": 0.0001, "loss": 4.3716, "loss/crossentropy": 2.105263113975525, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24552470445632935, "step": 7004 }, { "epoch": 0.14012, "grad_norm": 2.375, "grad_norm_var": 0.014427693684895833, "learning_rate": 0.0001, "loss": 4.3566, "loss/crossentropy": 2.03000670671463, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24780434370040894, "step": 7006 }, { "epoch": 0.14016, "grad_norm": 2.40625, "grad_norm_var": 0.0173980712890625, "learning_rate": 0.0001, "loss": 4.4758, "loss/crossentropy": 2.288944959640503, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2760937511920929, "step": 7008 }, { "epoch": 0.1402, "grad_norm": 1.9609375, "grad_norm_var": 0.06544570922851563, "learning_rate": 0.0001, "loss": 3.9326, "loss/crossentropy": 1.790147304534912, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1905786320567131, "step": 7010 }, { "epoch": 0.14024, "grad_norm": 2.609375, "grad_norm_var": 0.07765884399414062, "learning_rate": 0.0001, "loss": 4.6364, "loss/crossentropy": 2.008346378803253, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23138166218996048, "step": 7012 }, { "epoch": 0.14028, "grad_norm": 2.1875, "grad_norm_var": 0.07974014282226563, "learning_rate": 0.0001, "loss": 4.2304, "loss/crossentropy": 1.9694496393203735, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21347863972187042, "step": 7014 }, { "epoch": 0.14032, "grad_norm": 2.203125, "grad_norm_var": 0.07948989868164062, "learning_rate": 0.0001, "loss": 4.3685, "loss/crossentropy": 2.0907286405563354, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23349857330322266, "step": 7016 }, { "epoch": 0.14036, "grad_norm": 2.40625, "grad_norm_var": 0.07850316365559896, "learning_rate": 0.0001, "loss": 4.6454, "loss/crossentropy": 2.161414623260498, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23841089010238647, "step": 7018 }, { "epoch": 0.1404, "grad_norm": 2.25, "grad_norm_var": 0.07315266927083333, "learning_rate": 0.0001, "loss": 4.4868, "loss/crossentropy": 2.1402887105941772, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25493840873241425, "step": 7020 }, { "epoch": 0.14044, "grad_norm": 2.234375, "grad_norm_var": 0.06809666951497396, "learning_rate": 0.0001, "loss": 4.1248, "loss/crossentropy": 2.0703811049461365, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24300381541252136, "step": 7022 }, { "epoch": 0.14048, "grad_norm": 2.53125, "grad_norm_var": 0.0716875712076823, "learning_rate": 0.0001, "loss": 4.6165, "loss/crossentropy": 2.152569532394409, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23347805440425873, "step": 7024 }, { "epoch": 0.14052, "grad_norm": 2.171875, "grad_norm_var": 0.03453369140625, "learning_rate": 0.0001, "loss": 4.1642, "loss/crossentropy": 2.0831095576286316, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2198955938220024, "step": 7026 }, { "epoch": 0.14056, "grad_norm": 2.234375, "grad_norm_var": 0.019498697916666665, "learning_rate": 0.0001, "loss": 4.4929, "loss/crossentropy": 2.1631508469581604, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23971489816904068, "step": 7028 }, { "epoch": 0.1406, "grad_norm": 2.421875, "grad_norm_var": 0.019481404622395834, "learning_rate": 0.0001, "loss": 4.4687, "loss/crossentropy": 2.1683043241500854, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2628757208585739, "step": 7030 }, { "epoch": 0.14064, "grad_norm": 2.1875, "grad_norm_var": 0.07377827962239583, "learning_rate": 0.0001, "loss": 4.522, "loss/crossentropy": 2.021001398563385, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22922180593013763, "step": 7032 }, { "epoch": 0.14068, "grad_norm": 2.265625, "grad_norm_var": 0.0725494384765625, "learning_rate": 0.0001, "loss": 4.7729, "loss/crossentropy": 2.267430543899536, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.27525072544813156, "step": 7034 }, { "epoch": 0.14072, "grad_norm": 2.0625, "grad_norm_var": 0.0788726806640625, "learning_rate": 0.0001, "loss": 4.3118, "loss/crossentropy": 2.066729426383972, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21959475427865982, "step": 7036 }, { "epoch": 0.14076, "grad_norm": 2.0625, "grad_norm_var": 0.08162333170572916, "learning_rate": 0.0001, "loss": 4.3203, "loss/crossentropy": 1.7972697019577026, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21773608028888702, "step": 7038 }, { "epoch": 0.1408, "grad_norm": 2.171875, "grad_norm_var": 0.08068745930989583, "learning_rate": 0.0001, "loss": 4.0745, "loss/crossentropy": 1.751904845237732, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19929176568984985, "step": 7040 }, { "epoch": 0.14084, "grad_norm": 2.171875, "grad_norm_var": 0.07649332682291667, "learning_rate": 0.0001, "loss": 4.4188, "loss/crossentropy": 1.8432873487472534, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21235650032758713, "step": 7042 }, { "epoch": 0.14088, "grad_norm": 2.15625, "grad_norm_var": 0.07618815104166667, "learning_rate": 0.0001, "loss": 4.2343, "loss/crossentropy": 1.9589285850524902, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21506928652524948, "step": 7044 }, { "epoch": 0.14092, "grad_norm": 2.046875, "grad_norm_var": 0.0788726806640625, "learning_rate": 0.0001, "loss": 4.229, "loss/crossentropy": 2.3658028841018677, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23405643552541733, "step": 7046 }, { "epoch": 0.14096, "grad_norm": 2.140625, "grad_norm_var": 0.008837890625, "learning_rate": 0.0001, "loss": 4.3922, "loss/crossentropy": 2.088135540485382, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22823208570480347, "step": 7048 }, { "epoch": 0.141, "grad_norm": 2.09375, "grad_norm_var": 0.006810506184895833, "learning_rate": 0.0001, "loss": 4.2367, "loss/crossentropy": 1.9309821724891663, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21409911662340164, "step": 7050 }, { "epoch": 0.14104, "grad_norm": 2.1875, "grad_norm_var": 0.006636555989583333, "learning_rate": 0.0001, "loss": 4.5237, "loss/crossentropy": 2.5411492586135864, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25017087161540985, "step": 7052 }, { "epoch": 0.14108, "grad_norm": 2.03125, "grad_norm_var": 0.007352701822916667, "learning_rate": 0.0001, "loss": 4.4998, "loss/crossentropy": 2.3210322856903076, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23967693746089935, "step": 7054 }, { "epoch": 0.14112, "grad_norm": 2.046875, "grad_norm_var": 0.009370930989583333, "learning_rate": 0.0001, "loss": 4.4607, "loss/crossentropy": 2.054674744606018, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23270255327224731, "step": 7056 }, { "epoch": 0.14116, "grad_norm": 2.125, "grad_norm_var": 0.008968098958333334, "learning_rate": 0.0001, "loss": 4.5423, "loss/crossentropy": 2.545789122581482, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25560182332992554, "step": 7058 }, { "epoch": 0.1412, "grad_norm": 2.171875, "grad_norm_var": 0.008861287434895834, "learning_rate": 0.0001, "loss": 4.2682, "loss/crossentropy": 2.262348175048828, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24763159453868866, "step": 7060 }, { "epoch": 0.14124, "grad_norm": 2.1875, "grad_norm_var": 0.008447265625, "learning_rate": 0.0001, "loss": 4.618, "loss/crossentropy": 2.1045475602149963, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2472379505634308, "step": 7062 }, { "epoch": 0.14128, "grad_norm": 2.046875, "grad_norm_var": 0.0093170166015625, "learning_rate": 0.0001, "loss": 4.2312, "loss/crossentropy": 1.5632115006446838, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19010942429304123, "step": 7064 }, { "epoch": 0.14132, "grad_norm": 2.171875, "grad_norm_var": 0.0116851806640625, "learning_rate": 0.0001, "loss": 4.2638, "loss/crossentropy": 2.0847875475883484, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21371345967054367, "step": 7066 }, { "epoch": 0.14136, "grad_norm": 2.25, "grad_norm_var": 0.018016560872395834, "learning_rate": 0.0001, "loss": 4.5171, "loss/crossentropy": 2.2243804931640625, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23065787553787231, "step": 7068 }, { "epoch": 0.1414, "grad_norm": 2.171875, "grad_norm_var": 0.016927083333333332, "learning_rate": 0.0001, "loss": 4.2812, "loss/crossentropy": 1.9477753639221191, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21616356819868088, "step": 7070 }, { "epoch": 0.14144, "grad_norm": 2.5625, "grad_norm_var": 0.024169921875, "learning_rate": 0.0001, "loss": 4.7005, "loss/crossentropy": 2.2598072290420532, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25795431435108185, "step": 7072 }, { "epoch": 0.14148, "grad_norm": 2.171875, "grad_norm_var": 0.022652180989583333, "learning_rate": 0.0001, "loss": 4.4345, "loss/crossentropy": 1.8817242980003357, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21974974125623703, "step": 7074 }, { "epoch": 0.14152, "grad_norm": 2.0, "grad_norm_var": 0.0256744384765625, "learning_rate": 0.0001, "loss": 4.5688, "loss/crossentropy": 2.5275847911834717, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24608048796653748, "step": 7076 }, { "epoch": 0.14156, "grad_norm": 2.125, "grad_norm_var": 0.0265045166015625, "learning_rate": 0.0001, "loss": 4.3937, "loss/crossentropy": 2.400865852832794, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22960034757852554, "step": 7078 }, { "epoch": 0.1416, "grad_norm": 2.140625, "grad_norm_var": 0.0237457275390625, "learning_rate": 0.0001, "loss": 4.4005, "loss/crossentropy": 1.908901333808899, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22271078824996948, "step": 7080 }, { "epoch": 0.14164, "grad_norm": 2.171875, "grad_norm_var": 0.02213134765625, "learning_rate": 0.0001, "loss": 4.1384, "loss/crossentropy": 2.3330780267715454, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23944097012281418, "step": 7082 }, { "epoch": 0.14168, "grad_norm": 2.140625, "grad_norm_var": 0.018941243489583332, "learning_rate": 0.0001, "loss": 4.3516, "loss/crossentropy": 2.332213521003723, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23576530069112778, "step": 7084 }, { "epoch": 0.14172, "grad_norm": 2.125, "grad_norm_var": 0.022435506184895832, "learning_rate": 0.0001, "loss": 4.4654, "loss/crossentropy": 2.2269067764282227, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25341375917196274, "step": 7086 }, { "epoch": 0.14176, "grad_norm": 2.203125, "grad_norm_var": 0.01470947265625, "learning_rate": 0.0001, "loss": 4.2491, "loss/crossentropy": 2.461983561515808, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24491792172193527, "step": 7088 }, { "epoch": 0.1418, "grad_norm": 1.9765625, "grad_norm_var": 0.013952382405598958, "learning_rate": 0.0001, "loss": 4.2348, "loss/crossentropy": 2.428719997406006, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2394469603896141, "step": 7090 }, { "epoch": 0.14184, "grad_norm": 2.109375, "grad_norm_var": 0.011433664957682292, "learning_rate": 0.0001, "loss": 4.2242, "loss/crossentropy": 2.32351291179657, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23441501706838608, "step": 7092 }, { "epoch": 0.14188, "grad_norm": 2.171875, "grad_norm_var": 0.011482493082682291, "learning_rate": 0.0001, "loss": 4.4155, "loss/crossentropy": 2.1165764331817627, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2350979596376419, "step": 7094 }, { "epoch": 0.14192, "grad_norm": 2.3125, "grad_norm_var": 0.014288075764973958, "learning_rate": 0.0001, "loss": 4.4952, "loss/crossentropy": 2.150681734085083, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2475891336798668, "step": 7096 }, { "epoch": 0.14196, "grad_norm": 2.046875, "grad_norm_var": 0.013079579671223958, "learning_rate": 0.0001, "loss": 4.2753, "loss/crossentropy": 2.038177013397217, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2293965071439743, "step": 7098 }, { "epoch": 0.142, "grad_norm": 2.0625, "grad_norm_var": 0.014062245686848959, "learning_rate": 0.0001, "loss": 4.298, "loss/crossentropy": 1.899521827697754, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20856370776891708, "step": 7100 }, { "epoch": 0.14204, "grad_norm": 2.296875, "grad_norm_var": 0.010593414306640625, "learning_rate": 0.0001, "loss": 4.3449, "loss/crossentropy": 1.9807924032211304, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22756796330213547, "step": 7102 }, { "epoch": 0.14208, "grad_norm": 2.0625, "grad_norm_var": 0.009421539306640626, "learning_rate": 0.0001, "loss": 4.2893, "loss/crossentropy": 2.158667206764221, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22594892233610153, "step": 7104 }, { "epoch": 0.14212, "grad_norm": 2.15625, "grad_norm_var": 0.007811482747395833, "learning_rate": 0.0001, "loss": 4.3751, "loss/crossentropy": 2.3133270144462585, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22925584018230438, "step": 7106 }, { "epoch": 0.14216, "grad_norm": 2.015625, "grad_norm_var": 0.010107421875, "learning_rate": 0.0001, "loss": 4.2762, "loss/crossentropy": 1.9796301126480103, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21650104224681854, "step": 7108 }, { "epoch": 0.1422, "grad_norm": 2.09375, "grad_norm_var": 0.010054524739583333, "learning_rate": 0.0001, "loss": 4.2778, "loss/crossentropy": 2.092659056186676, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23407263308763504, "step": 7110 }, { "epoch": 0.14224, "grad_norm": 2.125, "grad_norm_var": 0.009056599934895833, "learning_rate": 0.0001, "loss": 4.6078, "loss/crossentropy": 1.970819890499115, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2262839823961258, "step": 7112 }, { "epoch": 0.14228, "grad_norm": 1.9921875, "grad_norm_var": 0.011923980712890626, "learning_rate": 0.0001, "loss": 4.0642, "loss/crossentropy": 1.5877107381820679, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.182430237531662, "step": 7114 }, { "epoch": 0.14232, "grad_norm": 2.21875, "grad_norm_var": 0.020336659749348958, "learning_rate": 0.0001, "loss": 4.6997, "loss/crossentropy": 2.3208755254745483, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25018931180238724, "step": 7116 }, { "epoch": 0.14236, "grad_norm": 2.203125, "grad_norm_var": 0.01904271443684896, "learning_rate": 0.0001, "loss": 4.4483, "loss/crossentropy": 2.1348973512649536, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23516173660755157, "step": 7118 }, { "epoch": 0.1424, "grad_norm": 2.46875, "grad_norm_var": 0.02533543904622396, "learning_rate": 0.0001, "loss": 4.3443, "loss/crossentropy": 2.1442995071411133, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23094037175178528, "step": 7120 }, { "epoch": 0.14244, "grad_norm": 2.1875, "grad_norm_var": 0.025608062744140625, "learning_rate": 0.0001, "loss": 4.3297, "loss/crossentropy": 2.24001145362854, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2479296177625656, "step": 7122 }, { "epoch": 0.14248, "grad_norm": 2.734375, "grad_norm_var": 0.038917795817057295, "learning_rate": 0.0001, "loss": 4.4122, "loss/crossentropy": 1.8284733891487122, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23670874536037445, "step": 7124 }, { "epoch": 0.14252, "grad_norm": 2.375, "grad_norm_var": 0.04146499633789062, "learning_rate": 0.0001, "loss": 4.6223, "loss/crossentropy": 2.1003533601760864, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24235840141773224, "step": 7126 }, { "epoch": 0.14256, "grad_norm": 2.078125, "grad_norm_var": 0.04201024373372396, "learning_rate": 0.0001, "loss": 4.2591, "loss/crossentropy": 2.3661316633224487, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.258208692073822, "step": 7128 }, { "epoch": 0.1426, "grad_norm": 2.109375, "grad_norm_var": 0.030989583333333334, "learning_rate": 0.0001, "loss": 4.3288, "loss/crossentropy": 2.2374593019485474, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2543400973081589, "step": 7130 }, { "epoch": 0.14264, "grad_norm": 2.15625, "grad_norm_var": 0.02945556640625, "learning_rate": 0.0001, "loss": 4.5939, "loss/crossentropy": 1.9141342639923096, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23257911205291748, "step": 7132 }, { "epoch": 0.14268, "grad_norm": 2.4375, "grad_norm_var": 0.031224568684895832, "learning_rate": 0.0001, "loss": 4.1893, "loss/crossentropy": 1.992666780948639, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2298717424273491, "step": 7134 }, { "epoch": 0.14272, "grad_norm": 2.03125, "grad_norm_var": 0.033503214518229164, "learning_rate": 0.0001, "loss": 4.2665, "loss/crossentropy": 1.9794283509254456, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2112378552556038, "step": 7136 }, { "epoch": 0.14276, "grad_norm": 2.5, "grad_norm_var": 0.0382232666015625, "learning_rate": 0.0001, "loss": 4.3801, "loss/crossentropy": 2.1011139154434204, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.27278490364551544, "step": 7138 }, { "epoch": 0.1428, "grad_norm": 2.234375, "grad_norm_var": 0.02340087890625, "learning_rate": 0.0001, "loss": 4.6989, "loss/crossentropy": 2.3489880561828613, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24445360153913498, "step": 7140 }, { "epoch": 0.14284, "grad_norm": 2.28125, "grad_norm_var": 0.0173736572265625, "learning_rate": 0.0001, "loss": 4.3418, "loss/crossentropy": 2.011172831058502, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.210151307284832, "step": 7142 }, { "epoch": 0.14288, "grad_norm": 2.25, "grad_norm_var": 0.017366536458333335, "learning_rate": 0.0001, "loss": 4.3488, "loss/crossentropy": 1.963642418384552, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22939135879278183, "step": 7144 }, { "epoch": 0.14292, "grad_norm": 2.21875, "grad_norm_var": 0.016194661458333332, "learning_rate": 0.0001, "loss": 4.5166, "loss/crossentropy": 2.2739341259002686, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23235367238521576, "step": 7146 }, { "epoch": 0.14296, "grad_norm": 2.21875, "grad_norm_var": 0.018973795572916667, "learning_rate": 0.0001, "loss": 4.5977, "loss/crossentropy": 2.282576322555542, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24144183099269867, "step": 7148 }, { "epoch": 0.143, "grad_norm": 2.078125, "grad_norm_var": 0.02329279581705729, "learning_rate": 0.0001, "loss": 4.141, "loss/crossentropy": 1.7847901582717896, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19184929877519608, "step": 7150 }, { "epoch": 0.14304, "grad_norm": 2.15625, "grad_norm_var": 0.021740468343098958, "learning_rate": 0.0001, "loss": 4.3379, "loss/crossentropy": 2.165170907974243, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.223430335521698, "step": 7152 }, { "epoch": 0.14308, "grad_norm": 2.15625, "grad_norm_var": 0.018873850504557293, "learning_rate": 0.0001, "loss": 4.3407, "loss/crossentropy": 2.0395787954330444, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21662698686122894, "step": 7154 }, { "epoch": 0.14312, "grad_norm": 2.125, "grad_norm_var": 0.01907323201497396, "learning_rate": 0.0001, "loss": 4.4936, "loss/crossentropy": 2.014316141605377, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23184175789356232, "step": 7156 }, { "epoch": 0.14316, "grad_norm": 2.296875, "grad_norm_var": 0.019419097900390626, "learning_rate": 0.0001, "loss": 4.5612, "loss/crossentropy": 2.2581117153167725, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24055174738168716, "step": 7158 }, { "epoch": 0.1432, "grad_norm": 2.28125, "grad_norm_var": 0.021022288004557292, "learning_rate": 0.0001, "loss": 4.3853, "loss/crossentropy": 2.0905630588531494, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2719188630580902, "step": 7160 }, { "epoch": 0.14324, "grad_norm": 2.25, "grad_norm_var": 0.025233713785807292, "learning_rate": 0.0001, "loss": 4.6485, "loss/crossentropy": 2.414529800415039, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24188701063394547, "step": 7162 }, { "epoch": 0.14328, "grad_norm": 2.328125, "grad_norm_var": 0.02240778605143229, "learning_rate": 0.0001, "loss": 4.3605, "loss/crossentropy": 2.028432607650757, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2300800457596779, "step": 7164 }, { "epoch": 0.14332, "grad_norm": 2.515625, "grad_norm_var": 0.03299153645833333, "learning_rate": 0.0001, "loss": 4.2947, "loss/crossentropy": 2.096144199371338, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23311930894851685, "step": 7166 }, { "epoch": 0.14336, "grad_norm": 2.25, "grad_norm_var": 0.029622395833333332, "learning_rate": 0.0001, "loss": 4.4375, "loss/crossentropy": 2.259281277656555, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25558799505233765, "step": 7168 }, { "epoch": 0.1434, "grad_norm": 2.25, "grad_norm_var": 0.0219390869140625, "learning_rate": 0.0001, "loss": 4.4364, "loss/crossentropy": 2.0766254663467407, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22725434601306915, "step": 7170 }, { "epoch": 0.14344, "grad_norm": 2.15625, "grad_norm_var": 0.024312337239583332, "learning_rate": 0.0001, "loss": 4.4695, "loss/crossentropy": 2.26702618598938, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22584324330091476, "step": 7172 }, { "epoch": 0.14348, "grad_norm": 2.203125, "grad_norm_var": 0.027534993489583333, "learning_rate": 0.0001, "loss": 4.3417, "loss/crossentropy": 2.1933096647262573, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23658160120248795, "step": 7174 }, { "epoch": 0.14352, "grad_norm": 2.1875, "grad_norm_var": 0.027372233072916665, "learning_rate": 0.0001, "loss": 4.351, "loss/crossentropy": 2.2003660202026367, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22349942475557327, "step": 7176 }, { "epoch": 0.14356, "grad_norm": 2.28125, "grad_norm_var": 0.025145467122395834, "learning_rate": 0.0001, "loss": 4.684, "loss/crossentropy": 2.4630067348480225, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.253003865480423, "step": 7178 }, { "epoch": 0.1436, "grad_norm": 2.109375, "grad_norm_var": 0.027082316080729165, "learning_rate": 0.0001, "loss": 4.6875, "loss/crossentropy": 2.264480948448181, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.27604997158050537, "step": 7180 }, { "epoch": 0.14364, "grad_norm": 2.203125, "grad_norm_var": 0.00712890625, "learning_rate": 0.0001, "loss": 4.3064, "loss/crossentropy": 2.1641955375671387, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22587595880031586, "step": 7182 }, { "epoch": 0.14368, "grad_norm": 2.296875, "grad_norm_var": 0.00943603515625, "learning_rate": 0.0001, "loss": 4.6137, "loss/crossentropy": 2.1432350873947144, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24324018508195877, "step": 7184 }, { "epoch": 0.14372, "grad_norm": 1.9140625, "grad_norm_var": 0.014611562093098959, "learning_rate": 0.0001, "loss": 4.3394, "loss/crossentropy": 1.7448238134384155, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1751260682940483, "step": 7186 }, { "epoch": 0.14376, "grad_norm": 2.1875, "grad_norm_var": 0.015295155843098958, "learning_rate": 0.0001, "loss": 4.3602, "loss/crossentropy": 2.3202184438705444, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2314094603061676, "step": 7188 }, { "epoch": 0.1438, "grad_norm": 1.984375, "grad_norm_var": 0.019681549072265624, "learning_rate": 0.0001, "loss": 4.187, "loss/crossentropy": 1.970094919204712, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24998464435338974, "step": 7190 }, { "epoch": 0.14384, "grad_norm": 2.125, "grad_norm_var": 0.02075780232747396, "learning_rate": 0.0001, "loss": 4.215, "loss/crossentropy": 2.1331114768981934, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2008385732769966, "step": 7192 }, { "epoch": 0.14388, "grad_norm": 2.03125, "grad_norm_var": 0.021345774332682293, "learning_rate": 0.0001, "loss": 4.3976, "loss/crossentropy": 2.1659106016159058, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22314336150884628, "step": 7194 }, { "epoch": 0.14392, "grad_norm": 2.1875, "grad_norm_var": 0.018507639567057293, "learning_rate": 0.0001, "loss": 4.6324, "loss/crossentropy": 2.3382883071899414, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26393643021583557, "step": 7196 }, { "epoch": 0.14396, "grad_norm": 2.34375, "grad_norm_var": 0.020499420166015626, "learning_rate": 0.0001, "loss": 4.7988, "loss/crossentropy": 2.1325554847717285, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2457222416996956, "step": 7198 }, { "epoch": 0.144, "grad_norm": 2.078125, "grad_norm_var": 0.03144709269205729, "learning_rate": 0.0001, "loss": 4.3175, "loss/crossentropy": 1.7927106022834778, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20284093916416168, "step": 7200 }, { "epoch": 0.14404, "grad_norm": 2.296875, "grad_norm_var": 0.03050715128580729, "learning_rate": 0.0001, "loss": 4.1676, "loss/crossentropy": 1.9799031615257263, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23475942015647888, "step": 7202 }, { "epoch": 0.14408, "grad_norm": 2.234375, "grad_norm_var": 0.02939020792643229, "learning_rate": 0.0001, "loss": 4.2627, "loss/crossentropy": 2.0590370893478394, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2160736471414566, "step": 7204 }, { "epoch": 0.14412, "grad_norm": 2.34375, "grad_norm_var": 0.02800267537434896, "learning_rate": 0.0001, "loss": 4.4397, "loss/crossentropy": 1.9866149425506592, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23413674533367157, "step": 7206 }, { "epoch": 0.14416, "grad_norm": 2.5625, "grad_norm_var": 0.03551610310872396, "learning_rate": 0.0001, "loss": 4.526, "loss/crossentropy": 2.1320748925209045, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24469739198684692, "step": 7208 }, { "epoch": 0.1442, "grad_norm": 2.375, "grad_norm_var": 0.034708404541015626, "learning_rate": 0.0001, "loss": 4.5834, "loss/crossentropy": 2.225857973098755, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23838083446025848, "step": 7210 }, { "epoch": 0.14424, "grad_norm": 2.078125, "grad_norm_var": 0.03794733683268229, "learning_rate": 0.0001, "loss": 3.9952, "loss/crossentropy": 1.9118528962135315, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2064301297068596, "step": 7212 }, { "epoch": 0.14428, "grad_norm": 2.21875, "grad_norm_var": 0.03806940714518229, "learning_rate": 0.0001, "loss": 4.182, "loss/crossentropy": 1.8142234086990356, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2172461450099945, "step": 7214 }, { "epoch": 0.14432, "grad_norm": 2.203125, "grad_norm_var": 0.02540868123372396, "learning_rate": 0.0001, "loss": 4.4446, "loss/crossentropy": 1.9308255910873413, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2236044555902481, "step": 7216 }, { "epoch": 0.14436, "grad_norm": 2.3125, "grad_norm_var": 0.022391764322916667, "learning_rate": 0.0001, "loss": 4.4597, "loss/crossentropy": 1.9821211695671082, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21110422909259796, "step": 7218 }, { "epoch": 0.1444, "grad_norm": 2.328125, "grad_norm_var": 0.023176066080729165, "learning_rate": 0.0001, "loss": 4.6732, "loss/crossentropy": 2.216045379638672, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23433538526296616, "step": 7220 }, { "epoch": 0.14444, "grad_norm": 2.5, "grad_norm_var": 0.024442545572916665, "learning_rate": 0.0001, "loss": 4.6089, "loss/crossentropy": 2.2303662300109863, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2390453889966011, "step": 7222 }, { "epoch": 0.14448, "grad_norm": 2.078125, "grad_norm_var": 0.017020670572916667, "learning_rate": 0.0001, "loss": 4.0805, "loss/crossentropy": 2.2152082920074463, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22196897864341736, "step": 7224 }, { "epoch": 0.14452, "grad_norm": 2.09375, "grad_norm_var": 0.014860026041666667, "learning_rate": 0.0001, "loss": 4.2843, "loss/crossentropy": 2.134513795375824, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21964208781719208, "step": 7226 }, { "epoch": 0.14456, "grad_norm": 2.0625, "grad_norm_var": 0.015262858072916666, "learning_rate": 0.0001, "loss": 4.0289, "loss/crossentropy": 1.6803861260414124, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18217483162879944, "step": 7228 }, { "epoch": 0.1446, "grad_norm": 2.078125, "grad_norm_var": 0.0152252197265625, "learning_rate": 0.0001, "loss": 4.1353, "loss/crossentropy": 1.6597792506217957, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20317095518112183, "step": 7230 }, { "epoch": 0.14464, "grad_norm": 2.015625, "grad_norm_var": 0.017704264322916666, "learning_rate": 0.0001, "loss": 4.2978, "loss/crossentropy": 1.776586651802063, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21835225820541382, "step": 7232 }, { "epoch": 0.14468, "grad_norm": 2.03125, "grad_norm_var": 0.017154947916666666, "learning_rate": 0.0001, "loss": 4.1092, "loss/crossentropy": 1.7347259521484375, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21964067220687866, "step": 7234 }, { "epoch": 0.14472, "grad_norm": 2.046875, "grad_norm_var": 0.015380859375, "learning_rate": 0.0001, "loss": 4.2069, "loss/crossentropy": 1.79097181558609, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21776803582906723, "step": 7236 }, { "epoch": 0.14476, "grad_norm": 1.984375, "grad_norm_var": 0.0069244384765625, "learning_rate": 0.0001, "loss": 4.0335, "loss/crossentropy": 2.051329553127289, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21669812500476837, "step": 7238 }, { "epoch": 0.1448, "grad_norm": 2.15625, "grad_norm_var": 0.0070220947265625, "learning_rate": 0.0001, "loss": 4.3206, "loss/crossentropy": 1.965324580669403, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2283879667520523, "step": 7240 }, { "epoch": 0.14484, "grad_norm": 2.296875, "grad_norm_var": 0.010692342122395834, "learning_rate": 0.0001, "loss": 4.5952, "loss/crossentropy": 2.248784363269806, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24906039983034134, "step": 7242 }, { "epoch": 0.14488, "grad_norm": 2.125, "grad_norm_var": 0.010529581705729167, "learning_rate": 0.0001, "loss": 4.3321, "loss/crossentropy": 1.9946890473365784, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22583268582820892, "step": 7244 }, { "epoch": 0.14492, "grad_norm": 2.0625, "grad_norm_var": 0.010660807291666666, "learning_rate": 0.0001, "loss": 4.1436, "loss/crossentropy": 2.2306413650512695, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22068945318460464, "step": 7246 }, { "epoch": 0.14496, "grad_norm": 2.1875, "grad_norm_var": 0.00758056640625, "learning_rate": 0.0001, "loss": 4.2159, "loss/crossentropy": 2.110253095626831, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22045104205608368, "step": 7248 }, { "epoch": 0.145, "grad_norm": 2.21875, "grad_norm_var": 0.008210245768229167, "learning_rate": 0.0001, "loss": 4.0693, "loss/crossentropy": 1.928157925605774, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2043258175253868, "step": 7250 }, { "epoch": 0.14504, "grad_norm": 2.0, "grad_norm_var": 0.009110514322916667, "learning_rate": 0.0001, "loss": 4.2458, "loss/crossentropy": 2.2674691677093506, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23258862644433975, "step": 7252 }, { "epoch": 0.14508, "grad_norm": 2.765625, "grad_norm_var": 0.03052978515625, "learning_rate": 0.0001, "loss": 4.5107, "loss/crossentropy": 2.2825024127960205, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.263367660343647, "step": 7254 }, { "epoch": 0.14512, "grad_norm": 2.265625, "grad_norm_var": 0.040185546875, "learning_rate": 0.0001, "loss": 4.2394, "loss/crossentropy": 2.1546168327331543, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23073332011699677, "step": 7256 }, { "epoch": 0.14516, "grad_norm": 2.34375, "grad_norm_var": 0.040816243489583334, "learning_rate": 0.0001, "loss": 4.5504, "loss/crossentropy": 2.0490044951438904, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24161820113658905, "step": 7258 }, { "epoch": 0.1452, "grad_norm": 2.234375, "grad_norm_var": 0.04038798014322917, "learning_rate": 0.0001, "loss": 4.6468, "loss/crossentropy": 2.115446150302887, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2537624090909958, "step": 7260 }, { "epoch": 0.14524, "grad_norm": 2.203125, "grad_norm_var": 0.22388407389322917, "learning_rate": 0.0001, "loss": 4.1457, "loss/crossentropy": 2.0302165746688843, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21026766300201416, "step": 7262 }, { "epoch": 0.14528, "grad_norm": 2.171875, "grad_norm_var": 0.2228179931640625, "learning_rate": 0.0001, "loss": 4.4077, "loss/crossentropy": 2.102527379989624, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2354147508740425, "step": 7264 }, { "epoch": 0.14532, "grad_norm": 2.171875, "grad_norm_var": 0.21614176432291668, "learning_rate": 0.0001, "loss": 4.1108, "loss/crossentropy": 2.0095282793045044, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22679834067821503, "step": 7266 }, { "epoch": 0.14536, "grad_norm": 2.265625, "grad_norm_var": 0.20706278483072918, "learning_rate": 0.0001, "loss": 4.3849, "loss/crossentropy": 1.8988603353500366, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21238050609827042, "step": 7268 }, { "epoch": 0.1454, "grad_norm": 2.203125, "grad_norm_var": 0.19975484212239583, "learning_rate": 0.0001, "loss": 4.6614, "loss/crossentropy": 2.186660885810852, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24453039467334747, "step": 7270 }, { "epoch": 0.14544, "grad_norm": 2.0625, "grad_norm_var": 0.20244852701822916, "learning_rate": 0.0001, "loss": 4.1409, "loss/crossentropy": 1.8927155137062073, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2147793024778366, "step": 7272 }, { "epoch": 0.14548, "grad_norm": 2.046875, "grad_norm_var": 0.20608317057291667, "learning_rate": 0.0001, "loss": 4.1375, "loss/crossentropy": 1.8969642519950867, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21910040825605392, "step": 7274 }, { "epoch": 0.14552, "grad_norm": 2.21875, "grad_norm_var": 0.20545247395833333, "learning_rate": 0.0001, "loss": 4.3325, "loss/crossentropy": 2.090053617954254, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22576116025447845, "step": 7276 }, { "epoch": 0.14556, "grad_norm": 2.171875, "grad_norm_var": 0.010480753580729167, "learning_rate": 0.0001, "loss": 4.7263, "loss/crossentropy": 2.1606650352478027, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22937766462564468, "step": 7278 }, { "epoch": 0.1456, "grad_norm": 2.015625, "grad_norm_var": 0.013451131184895833, "learning_rate": 0.0001, "loss": 4.0924, "loss/crossentropy": 1.9946333765983582, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20145532488822937, "step": 7280 }, { "epoch": 0.14564, "grad_norm": 2.296875, "grad_norm_var": 0.013834635416666666, "learning_rate": 0.0001, "loss": 4.6519, "loss/crossentropy": 2.0958545207977295, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22669509798288345, "step": 7282 }, { "epoch": 0.14568, "grad_norm": 2.328125, "grad_norm_var": 0.016109212239583334, "learning_rate": 0.0001, "loss": 4.4224, "loss/crossentropy": 2.0515894889831543, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2520884945988655, "step": 7284 }, { "epoch": 0.14572, "grad_norm": 2.171875, "grad_norm_var": 0.016063435872395834, "learning_rate": 0.0001, "loss": 4.1572, "loss/crossentropy": 2.034587264060974, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21523287147283554, "step": 7286 }, { "epoch": 0.14576, "grad_norm": 2.140625, "grad_norm_var": 0.016080729166666665, "learning_rate": 0.0001, "loss": 3.8649, "loss/crossentropy": 1.6578314900398254, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19603776931762695, "step": 7288 }, { "epoch": 0.1458, "grad_norm": 2.109375, "grad_norm_var": 0.014167277018229167, "learning_rate": 0.0001, "loss": 4.2443, "loss/crossentropy": 2.0019100308418274, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22880420833826065, "step": 7290 }, { "epoch": 0.14584, "grad_norm": 2.046875, "grad_norm_var": 0.018723297119140624, "learning_rate": 0.0001, "loss": 3.981, "loss/crossentropy": 2.068517565727234, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19675085693597794, "step": 7292 }, { "epoch": 0.14588, "grad_norm": 2.203125, "grad_norm_var": 0.013734690348307292, "learning_rate": 0.0001, "loss": 4.4037, "loss/crossentropy": 2.000797212123871, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20464950054883957, "step": 7294 }, { "epoch": 0.14592, "grad_norm": 2.125, "grad_norm_var": 0.013734690348307292, "learning_rate": 0.0001, "loss": 4.4132, "loss/crossentropy": 2.1914668679237366, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2406519278883934, "step": 7296 }, { "epoch": 0.14596, "grad_norm": 2.390625, "grad_norm_var": 0.01685358683268229, "learning_rate": 0.0001, "loss": 4.4737, "loss/crossentropy": 2.123211979866028, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.250032439827919, "step": 7298 }, { "epoch": 0.146, "grad_norm": 2.625, "grad_norm_var": 0.028507232666015625, "learning_rate": 0.0001, "loss": 4.718, "loss/crossentropy": 2.0686148405075073, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22581970691680908, "step": 7300 }, { "epoch": 0.14604, "grad_norm": 2.25, "grad_norm_var": 0.029288482666015626, "learning_rate": 0.0001, "loss": 4.2295, "loss/crossentropy": 2.141907751560211, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21503636240959167, "step": 7302 }, { "epoch": 0.14608, "grad_norm": 2.09375, "grad_norm_var": 0.028436024983723957, "learning_rate": 0.0001, "loss": 4.1493, "loss/crossentropy": 1.6741206645965576, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20490698516368866, "step": 7304 }, { "epoch": 0.14612, "grad_norm": 2.09375, "grad_norm_var": 0.027854156494140626, "learning_rate": 0.0001, "loss": 4.1259, "loss/crossentropy": 1.8094561696052551, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21328043192625046, "step": 7306 }, { "epoch": 0.14616, "grad_norm": 2.109375, "grad_norm_var": 0.020970662434895832, "learning_rate": 0.0001, "loss": 4.3715, "loss/crossentropy": 2.204083800315857, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24204879999160767, "step": 7308 }, { "epoch": 0.1462, "grad_norm": 2.234375, "grad_norm_var": 0.021126302083333333, "learning_rate": 0.0001, "loss": 4.2614, "loss/crossentropy": 2.0166819095611572, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2219880372285843, "step": 7310 }, { "epoch": 0.14624, "grad_norm": 2.25, "grad_norm_var": 0.022684733072916668, "learning_rate": 0.0001, "loss": 4.3768, "loss/crossentropy": 2.4667757749557495, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2880199924111366, "step": 7312 }, { "epoch": 0.14628, "grad_norm": 2.1875, "grad_norm_var": 0.019287109375, "learning_rate": 0.0001, "loss": 4.2005, "loss/crossentropy": 2.1497310400009155, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21792854368686676, "step": 7314 }, { "epoch": 0.14632, "grad_norm": 2.25, "grad_norm_var": 0.0062896728515625, "learning_rate": 0.0001, "loss": 4.3911, "loss/crossentropy": 2.179584264755249, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23200812935829163, "step": 7316 }, { "epoch": 0.14636, "grad_norm": 6.0625, "grad_norm_var": 0.9582590738932292, "learning_rate": 0.0001, "loss": 4.1545, "loss/crossentropy": 1.4067250490188599, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.204126738011837, "step": 7318 }, { "epoch": 0.1464, "grad_norm": 2.390625, "grad_norm_var": 0.9473592122395833, "learning_rate": 0.0001, "loss": 4.2439, "loss/crossentropy": 1.9879435896873474, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.247541606426239, "step": 7320 }, { "epoch": 0.14644, "grad_norm": 2.28125, "grad_norm_var": 0.9377919514973958, "learning_rate": 0.0001, "loss": 4.1795, "loss/crossentropy": 1.62649005651474, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2054424211382866, "step": 7322 }, { "epoch": 0.14648, "grad_norm": 2.15625, "grad_norm_var": 0.9344228108723959, "learning_rate": 0.0001, "loss": 4.1479, "loss/crossentropy": 1.9012435674667358, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2240126132965088, "step": 7324 }, { "epoch": 0.14652, "grad_norm": 2.03125, "grad_norm_var": 0.948193359375, "learning_rate": 0.0001, "loss": 3.6841, "loss/crossentropy": 1.8239200115203857, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19691675901412964, "step": 7326 }, { "epoch": 0.14656, "grad_norm": 2.171875, "grad_norm_var": 0.94068603515625, "learning_rate": 0.0001, "loss": 4.2223, "loss/crossentropy": 1.9689037799835205, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21301231533288956, "step": 7328 }, { "epoch": 0.1466, "grad_norm": 2.09375, "grad_norm_var": 0.9363433837890625, "learning_rate": 0.0001, "loss": 4.3733, "loss/crossentropy": 2.1064560413360596, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23990632593631744, "step": 7330 }, { "epoch": 0.14664, "grad_norm": 2.109375, "grad_norm_var": 0.9397125244140625, "learning_rate": 0.0001, "loss": 4.2765, "loss/crossentropy": 2.0871987342834473, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23108436167240143, "step": 7332 }, { "epoch": 0.14668, "grad_norm": 2.265625, "grad_norm_var": 0.0209869384765625, "learning_rate": 0.0001, "loss": 4.4517, "loss/crossentropy": 2.3061007857322693, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24406791478395462, "step": 7334 }, { "epoch": 0.14672, "grad_norm": 2.296875, "grad_norm_var": 0.016943359375, "learning_rate": 0.0001, "loss": 4.4278, "loss/crossentropy": 1.9471244812011719, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20859277993440628, "step": 7336 }, { "epoch": 0.14676, "grad_norm": 2.28125, "grad_norm_var": 0.0131744384765625, "learning_rate": 0.0001, "loss": 4.188, "loss/crossentropy": 2.0628740191459656, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25501881539821625, "step": 7338 }, { "epoch": 0.1468, "grad_norm": 1.9921875, "grad_norm_var": 0.014481353759765624, "learning_rate": 0.0001, "loss": 4.4329, "loss/crossentropy": 1.8065250515937805, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20222238451242447, "step": 7340 }, { "epoch": 0.14684, "grad_norm": 2.109375, "grad_norm_var": 0.012910715738932292, "learning_rate": 0.0001, "loss": 4.5431, "loss/crossentropy": 2.134244918823242, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22319403290748596, "step": 7342 }, { "epoch": 0.14688, "grad_norm": 2.109375, "grad_norm_var": 0.012359364827473959, "learning_rate": 0.0001, "loss": 4.3098, "loss/crossentropy": 2.5807924270629883, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26865454018116, "step": 7344 }, { "epoch": 0.14692, "grad_norm": 2.203125, "grad_norm_var": 0.015421295166015625, "learning_rate": 0.0001, "loss": 4.684, "loss/crossentropy": 2.4128278493881226, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25771988928318024, "step": 7346 }, { "epoch": 0.14696, "grad_norm": 2.046875, "grad_norm_var": 0.017561594645182293, "learning_rate": 0.0001, "loss": 4.0915, "loss/crossentropy": 1.7323983907699585, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2146565169095993, "step": 7348 }, { "epoch": 0.147, "grad_norm": 2.125, "grad_norm_var": 0.015553538004557292, "learning_rate": 0.0001, "loss": 4.5511, "loss/crossentropy": 2.036192536354065, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21692586690187454, "step": 7350 }, { "epoch": 0.14704, "grad_norm": 2.09375, "grad_norm_var": 0.012320709228515626, "learning_rate": 0.0001, "loss": 4.4173, "loss/crossentropy": 1.9586528539657593, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21543999761343002, "step": 7352 }, { "epoch": 0.14708, "grad_norm": 2.234375, "grad_norm_var": 0.011805979410807292, "learning_rate": 0.0001, "loss": 4.4506, "loss/crossentropy": 2.3444113731384277, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24080512672662735, "step": 7354 }, { "epoch": 0.14712, "grad_norm": 2.15625, "grad_norm_var": 0.0097076416015625, "learning_rate": 0.0001, "loss": 4.4865, "loss/crossentropy": 2.3060439825057983, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24567800760269165, "step": 7356 }, { "epoch": 0.14716, "grad_norm": 2.359375, "grad_norm_var": 0.01279296875, "learning_rate": 0.0001, "loss": 4.5975, "loss/crossentropy": 2.2267106771469116, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2345375493168831, "step": 7358 }, { "epoch": 0.1472, "grad_norm": 2.359375, "grad_norm_var": 0.014481608072916667, "learning_rate": 0.0001, "loss": 4.7052, "loss/crossentropy": 2.2470518350601196, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2417411357164383, "step": 7360 }, { "epoch": 0.14724, "grad_norm": 1.9765625, "grad_norm_var": 0.012953440348307291, "learning_rate": 0.0001, "loss": 4.3299, "loss/crossentropy": 2.200543165206909, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24434109032154083, "step": 7362 }, { "epoch": 0.14728, "grad_norm": 2.203125, "grad_norm_var": 0.010227203369140625, "learning_rate": 0.0001, "loss": 4.3282, "loss/crossentropy": 1.995844304561615, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22591491788625717, "step": 7364 }, { "epoch": 0.14732, "grad_norm": 2.046875, "grad_norm_var": 0.011580149332682291, "learning_rate": 0.0001, "loss": 4.3354, "loss/crossentropy": 1.9180658459663391, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22776676714420319, "step": 7366 }, { "epoch": 0.14736, "grad_norm": 2.078125, "grad_norm_var": 0.011840565999348959, "learning_rate": 0.0001, "loss": 4.3662, "loss/crossentropy": 2.473931312561035, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26284685730934143, "step": 7368 }, { "epoch": 0.1474, "grad_norm": 2.109375, "grad_norm_var": 0.011744944254557292, "learning_rate": 0.0001, "loss": 4.3167, "loss/crossentropy": 2.0392738580703735, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22323766350746155, "step": 7370 }, { "epoch": 0.14744, "grad_norm": 2.203125, "grad_norm_var": 0.012094879150390625, "learning_rate": 0.0001, "loss": 4.3032, "loss/crossentropy": 1.9847410917282104, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22324485331773758, "step": 7372 }, { "epoch": 0.14748, "grad_norm": 2.171875, "grad_norm_var": 0.008459218343098958, "learning_rate": 0.0001, "loss": 4.2794, "loss/crossentropy": 1.930326521396637, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24155349284410477, "step": 7374 }, { "epoch": 0.14752, "grad_norm": 2.09375, "grad_norm_var": 0.0061724344889322914, "learning_rate": 0.0001, "loss": 4.0648, "loss/crossentropy": 1.825449824333191, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2087552770972252, "step": 7376 }, { "epoch": 0.14756, "grad_norm": 2.15625, "grad_norm_var": 0.004325358072916666, "learning_rate": 0.0001, "loss": 4.2405, "loss/crossentropy": 2.156645655632019, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2519562169909477, "step": 7378 }, { "epoch": 0.1476, "grad_norm": 2.40625, "grad_norm_var": 0.008333333333333333, "learning_rate": 0.0001, "loss": 4.5316, "loss/crossentropy": 2.255813479423523, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23737946152687073, "step": 7380 }, { "epoch": 0.14764, "grad_norm": 2.109375, "grad_norm_var": 0.00797119140625, "learning_rate": 0.0001, "loss": 4.5795, "loss/crossentropy": 2.47933566570282, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23890959471464157, "step": 7382 }, { "epoch": 0.14768, "grad_norm": 2.09375, "grad_norm_var": 0.007710774739583333, "learning_rate": 0.0001, "loss": 4.5328, "loss/crossentropy": 2.139566659927368, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22334300726652145, "step": 7384 }, { "epoch": 0.14772, "grad_norm": 2.125, "grad_norm_var": 0.007673136393229167, "learning_rate": 0.0001, "loss": 4.1581, "loss/crossentropy": 1.6182149052619934, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19345169514417648, "step": 7386 }, { "epoch": 0.14776, "grad_norm": 2.40625, "grad_norm_var": 0.010758463541666667, "learning_rate": 0.0001, "loss": 4.4571, "loss/crossentropy": 1.9984004497528076, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24823245406150818, "step": 7388 }, { "epoch": 0.1478, "grad_norm": 2.171875, "grad_norm_var": 0.010445149739583333, "learning_rate": 0.0001, "loss": 4.3218, "loss/crossentropy": 2.5892586708068848, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2489030361175537, "step": 7390 }, { "epoch": 0.14784, "grad_norm": 2.140625, "grad_norm_var": 0.009598795572916667, "learning_rate": 0.0001, "loss": 4.6077, "loss/crossentropy": 2.3723479509353638, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23511765897274017, "step": 7392 }, { "epoch": 0.14788, "grad_norm": 2.484375, "grad_norm_var": 0.0145660400390625, "learning_rate": 0.0001, "loss": 4.3902, "loss/crossentropy": 2.001940071582794, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22256013005971909, "step": 7394 }, { "epoch": 0.14792, "grad_norm": 2.0625, "grad_norm_var": 0.013850911458333334, "learning_rate": 0.0001, "loss": 4.4422, "loss/crossentropy": 2.2255555391311646, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23905682563781738, "step": 7396 }, { "epoch": 0.14796, "grad_norm": 2.28125, "grad_norm_var": 0.013277180989583333, "learning_rate": 0.0001, "loss": 4.6217, "loss/crossentropy": 2.5670583248138428, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25223178416490555, "step": 7398 }, { "epoch": 0.148, "grad_norm": 2.203125, "grad_norm_var": 2.3002278645833334, "learning_rate": 0.0001, "loss": 4.6365, "loss/crossentropy": 2.16109037399292, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2601509317755699, "step": 7400 }, { "epoch": 0.14804, "grad_norm": 2.09375, "grad_norm_var": 2.3012847900390625, "learning_rate": 0.0001, "loss": 4.4169, "loss/crossentropy": 2.1484315395355225, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21474219858646393, "step": 7402 }, { "epoch": 0.14808, "grad_norm": 4.3125, "grad_norm_var": 2.48385009765625, "learning_rate": 0.0001, "loss": 4.7452, "loss/crossentropy": 2.183099091053009, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25882233679294586, "step": 7404 }, { "epoch": 0.14812, "grad_norm": 2.0625, "grad_norm_var": 2.491097005208333, "learning_rate": 0.0001, "loss": 3.9739, "loss/crossentropy": 1.795831561088562, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20870305597782135, "step": 7406 }, { "epoch": 0.14816, "grad_norm": 2.09375, "grad_norm_var": 2.5058553059895834, "learning_rate": 0.0001, "loss": 4.3433, "loss/crossentropy": 2.202280640602112, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2150556966662407, "step": 7408 }, { "epoch": 0.1482, "grad_norm": 2.03125, "grad_norm_var": 2.5274251302083335, "learning_rate": 0.0001, "loss": 4.3063, "loss/crossentropy": 2.2616937160491943, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24934212118387222, "step": 7410 }, { "epoch": 0.14824, "grad_norm": 2.171875, "grad_norm_var": 2.51627197265625, "learning_rate": 0.0001, "loss": 4.3303, "loss/crossentropy": 1.909091055393219, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23191271722316742, "step": 7412 }, { "epoch": 0.14828, "grad_norm": 2.09375, "grad_norm_var": 2.5269765218098956, "learning_rate": 0.0001, "loss": 4.4224, "loss/crossentropy": 2.1383588314056396, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22893162816762924, "step": 7414 }, { "epoch": 0.14832, "grad_norm": 2.234375, "grad_norm_var": 0.2983062744140625, "learning_rate": 0.0001, "loss": 4.3664, "loss/crossentropy": 2.0628907680511475, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23075110465288162, "step": 7416 }, { "epoch": 0.14836, "grad_norm": 2.15625, "grad_norm_var": 0.29704488118489586, "learning_rate": 0.0001, "loss": 4.2109, "loss/crossentropy": 1.8996745347976685, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21171879768371582, "step": 7418 }, { "epoch": 0.1484, "grad_norm": 2.046875, "grad_norm_var": 0.0032867431640625, "learning_rate": 0.0001, "loss": 4.4241, "loss/crossentropy": 2.2367645502090454, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2563689202070236, "step": 7420 }, { "epoch": 0.14844, "grad_norm": 2.265625, "grad_norm_var": 0.003934733072916667, "learning_rate": 0.0001, "loss": 4.2326, "loss/crossentropy": 2.274489164352417, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2462785318493843, "step": 7422 }, { "epoch": 0.14848, "grad_norm": 2.328125, "grad_norm_var": 0.00562744140625, "learning_rate": 0.0001, "loss": 4.5288, "loss/crossentropy": 2.227620482444763, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24729500710964203, "step": 7424 }, { "epoch": 0.14852, "grad_norm": 2.0, "grad_norm_var": 0.006864420572916667, "learning_rate": 0.0001, "loss": 4.0913, "loss/crossentropy": 1.9911785125732422, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.216292105615139, "step": 7426 }, { "epoch": 0.14856, "grad_norm": 2.15625, "grad_norm_var": 0.008199055989583334, "learning_rate": 0.0001, "loss": 4.4071, "loss/crossentropy": 1.8715736865997314, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2289058193564415, "step": 7428 }, { "epoch": 0.1486, "grad_norm": 2.125, "grad_norm_var": 0.007991536458333334, "learning_rate": 0.0001, "loss": 4.4671, "loss/crossentropy": 2.0192378759384155, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2327374964952469, "step": 7430 }, { "epoch": 0.14864, "grad_norm": 2.046875, "grad_norm_var": 0.008226521809895833, "learning_rate": 0.0001, "loss": 4.213, "loss/crossentropy": 1.9407023191452026, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22586089372634888, "step": 7432 }, { "epoch": 0.14868, "grad_norm": 2.203125, "grad_norm_var": 0.01011962890625, "learning_rate": 0.0001, "loss": 4.7816, "loss/crossentropy": 2.6349592208862305, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2605717331171036, "step": 7434 }, { "epoch": 0.14872, "grad_norm": 2.28125, "grad_norm_var": 0.009989420572916666, "learning_rate": 0.0001, "loss": 4.5511, "loss/crossentropy": 2.3015077114105225, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2292959839105606, "step": 7436 }, { "epoch": 0.14876, "grad_norm": 2.1875, "grad_norm_var": 0.013231404622395833, "learning_rate": 0.0001, "loss": 4.1663, "loss/crossentropy": 2.081148624420166, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21435046195983887, "step": 7438 }, { "epoch": 0.1488, "grad_norm": 2.109375, "grad_norm_var": 0.011735026041666667, "learning_rate": 0.0001, "loss": 4.4342, "loss/crossentropy": 2.235422372817993, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23794984817504883, "step": 7440 }, { "epoch": 0.14884, "grad_norm": 2.171875, "grad_norm_var": 0.011295572916666666, "learning_rate": 0.0001, "loss": 4.4595, "loss/crossentropy": 2.1423263549804688, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2200494110584259, "step": 7442 }, { "epoch": 0.14888, "grad_norm": 2.421875, "grad_norm_var": 0.018342081705729166, "learning_rate": 0.0001, "loss": 4.514, "loss/crossentropy": 2.3156672716140747, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25282321125268936, "step": 7444 }, { "epoch": 0.14892, "grad_norm": 2.15625, "grad_norm_var": 0.020051066080729166, "learning_rate": 0.0001, "loss": 4.3107, "loss/crossentropy": 1.7777396440505981, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2104710191488266, "step": 7446 }, { "epoch": 0.14896, "grad_norm": 2.09375, "grad_norm_var": 0.023341623942057292, "learning_rate": 0.0001, "loss": 3.874, "loss/crossentropy": 1.8015541434288025, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1976182758808136, "step": 7448 }, { "epoch": 0.149, "grad_norm": 2.578125, "grad_norm_var": 0.04948298136393229, "learning_rate": 0.0001, "loss": 4.5647, "loss/crossentropy": 2.36995792388916, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23767317831516266, "step": 7450 }, { "epoch": 0.14904, "grad_norm": 2.1875, "grad_norm_var": 0.050142161051432294, "learning_rate": 0.0001, "loss": 4.7568, "loss/crossentropy": 2.2867971062660217, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2666979879140854, "step": 7452 }, { "epoch": 0.14908, "grad_norm": 2.328125, "grad_norm_var": 0.043794504801432294, "learning_rate": 0.0001, "loss": 4.773, "loss/crossentropy": 2.247913956642151, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2122594192624092, "step": 7454 }, { "epoch": 0.14912, "grad_norm": 2.109375, "grad_norm_var": 0.04197362263997396, "learning_rate": 0.0001, "loss": 4.273, "loss/crossentropy": 2.0020886063575745, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22448039799928665, "step": 7456 }, { "epoch": 0.14916, "grad_norm": 2.15625, "grad_norm_var": 0.043702952067057294, "learning_rate": 0.0001, "loss": 4.3139, "loss/crossentropy": 2.2290679216384888, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21489766240119934, "step": 7458 }, { "epoch": 0.1492, "grad_norm": 2.609375, "grad_norm_var": 0.049344635009765624, "learning_rate": 0.0001, "loss": 4.5919, "loss/crossentropy": 2.0447877049446106, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22640856355428696, "step": 7460 }, { "epoch": 0.14924, "grad_norm": 2.3125, "grad_norm_var": 0.045904286702473956, "learning_rate": 0.0001, "loss": 4.4614, "loss/crossentropy": 2.395468831062317, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2395479902625084, "step": 7462 }, { "epoch": 0.14928, "grad_norm": 2.0625, "grad_norm_var": 0.04019775390625, "learning_rate": 0.0001, "loss": 4.1552, "loss/crossentropy": 2.117182433605194, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22302603721618652, "step": 7464 }, { "epoch": 0.14932, "grad_norm": 2.125, "grad_norm_var": 0.019498697916666665, "learning_rate": 0.0001, "loss": 4.4046, "loss/crossentropy": 2.206045985221863, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2548774778842926, "step": 7466 }, { "epoch": 0.14936, "grad_norm": 2.046875, "grad_norm_var": 0.0194244384765625, "learning_rate": 0.0001, "loss": 4.1531, "loss/crossentropy": 1.9933450818061829, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22455794364213943, "step": 7468 }, { "epoch": 0.1494, "grad_norm": 2.125, "grad_norm_var": 0.017731730143229166, "learning_rate": 0.0001, "loss": 4.2744, "loss/crossentropy": 2.216577649116516, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24432705342769623, "step": 7470 }, { "epoch": 0.14944, "grad_norm": 2.25, "grad_norm_var": 0.018163045247395832, "learning_rate": 0.0001, "loss": 4.6072, "loss/crossentropy": 2.4282405376434326, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2599884122610092, "step": 7472 }, { "epoch": 0.14948, "grad_norm": 2.140625, "grad_norm_var": 0.018089803059895833, "learning_rate": 0.0001, "loss": 4.2833, "loss/crossentropy": 2.2677053213119507, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24509359151124954, "step": 7474 }, { "epoch": 0.14952, "grad_norm": 2.046875, "grad_norm_var": 0.007877604166666666, "learning_rate": 0.0001, "loss": 4.1271, "loss/crossentropy": 1.9610475897789001, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19867657870054245, "step": 7476 }, { "epoch": 0.14956, "grad_norm": 2.4375, "grad_norm_var": 0.012565104166666667, "learning_rate": 0.0001, "loss": 4.6286, "loss/crossentropy": 2.1379209756851196, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24773475527763367, "step": 7478 }, { "epoch": 0.1496, "grad_norm": 2.0625, "grad_norm_var": 0.0125152587890625, "learning_rate": 0.0001, "loss": 4.3593, "loss/crossentropy": 2.295411467552185, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2417762354016304, "step": 7480 }, { "epoch": 0.14964, "grad_norm": 2.28125, "grad_norm_var": 0.013374837239583333, "learning_rate": 0.0001, "loss": 4.4839, "loss/crossentropy": 2.0983279943466187, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22220823168754578, "step": 7482 }, { "epoch": 0.14968, "grad_norm": 2.34375, "grad_norm_var": 0.015086873372395834, "learning_rate": 0.0001, "loss": 4.3977, "loss/crossentropy": 2.130508065223694, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23387905955314636, "step": 7484 }, { "epoch": 0.14972, "grad_norm": 2.265625, "grad_norm_var": 0.015869140625, "learning_rate": 0.0001, "loss": 4.545, "loss/crossentropy": 2.0727924704551697, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23198049515485764, "step": 7486 }, { "epoch": 0.14976, "grad_norm": 2.28125, "grad_norm_var": 0.015523274739583334, "learning_rate": 0.0001, "loss": 4.3722, "loss/crossentropy": 2.1407171487808228, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23349495232105255, "step": 7488 }, { "epoch": 0.1498, "grad_norm": 2.0625, "grad_norm_var": 0.016063435872395834, "learning_rate": 0.0001, "loss": 4.3483, "loss/crossentropy": 2.123879909515381, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2230711579322815, "step": 7490 }, { "epoch": 0.14984, "grad_norm": 1.8984375, "grad_norm_var": 0.02194188435872396, "learning_rate": 0.0001, "loss": 3.8687, "loss/crossentropy": 2.1362847685813904, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22217128425836563, "step": 7492 }, { "epoch": 0.14988, "grad_norm": 2.234375, "grad_norm_var": 0.01625544230143229, "learning_rate": 0.0001, "loss": 4.4753, "loss/crossentropy": 2.2171897292137146, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.237472265958786, "step": 7494 }, { "epoch": 0.14992, "grad_norm": 2.203125, "grad_norm_var": 0.016841379801432292, "learning_rate": 0.0001, "loss": 4.2439, "loss/crossentropy": 1.9622138142585754, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19700734317302704, "step": 7496 }, { "epoch": 0.14996, "grad_norm": 2.078125, "grad_norm_var": 0.01666234334309896, "learning_rate": 0.0001, "loss": 4.1296, "loss/crossentropy": 2.300232410430908, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26260019838809967, "step": 7498 }, { "epoch": 0.15, "grad_norm": 2.21875, "grad_norm_var": 0.013765207926432292, "learning_rate": 0.0001, "loss": 4.4466, "loss/crossentropy": 1.9829946756362915, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21770135313272476, "step": 7500 }, { "epoch": 0.15004, "grad_norm": 2.109375, "grad_norm_var": 0.010935211181640625, "learning_rate": 0.0001, "loss": 4.017, "loss/crossentropy": 1.8421878218650818, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20965785533189774, "step": 7502 }, { "epoch": 0.15008, "grad_norm": 2.203125, "grad_norm_var": 0.010267893473307291, "learning_rate": 0.0001, "loss": 4.3111, "loss/crossentropy": 2.0559566020965576, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21752064675092697, "step": 7504 }, { "epoch": 0.15012, "grad_norm": 2.25, "grad_norm_var": 0.009834543863932291, "learning_rate": 0.0001, "loss": 4.5526, "loss/crossentropy": 1.862777590751648, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2026660442352295, "step": 7506 }, { "epoch": 0.15016, "grad_norm": 2.046875, "grad_norm_var": 0.0064198811848958336, "learning_rate": 0.0001, "loss": 4.0238, "loss/crossentropy": 2.340041399002075, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23489895462989807, "step": 7508 }, { "epoch": 0.1502, "grad_norm": 2.21875, "grad_norm_var": 0.006224568684895833, "learning_rate": 0.0001, "loss": 4.5434, "loss/crossentropy": 2.3596150875091553, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23775358498096466, "step": 7510 }, { "epoch": 0.15024, "grad_norm": 2.015625, "grad_norm_var": 0.005980428059895833, "learning_rate": 0.0001, "loss": 4.4637, "loss/crossentropy": 1.883503019809723, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22191710770130157, "step": 7512 }, { "epoch": 0.15028, "grad_norm": 2.09375, "grad_norm_var": 0.006723785400390625, "learning_rate": 0.0001, "loss": 4.1485, "loss/crossentropy": 2.12862491607666, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2177818939089775, "step": 7514 }, { "epoch": 0.15032, "grad_norm": 2.078125, "grad_norm_var": 0.005936431884765625, "learning_rate": 0.0001, "loss": 4.167, "loss/crossentropy": 1.9326343536376953, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20991001278162003, "step": 7516 }, { "epoch": 0.15036, "grad_norm": 2.046875, "grad_norm_var": 0.008070627848307291, "learning_rate": 0.0001, "loss": 4.4552, "loss/crossentropy": 2.1532761454582214, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2308126464486122, "step": 7518 }, { "epoch": 0.1504, "grad_norm": 2.21875, "grad_norm_var": 0.007867177327473959, "learning_rate": 0.0001, "loss": 4.42, "loss/crossentropy": 1.876515507698059, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1940685734152794, "step": 7520 }, { "epoch": 0.15044, "grad_norm": 2.28125, "grad_norm_var": 0.008937327067057292, "learning_rate": 0.0001, "loss": 4.3515, "loss/crossentropy": 2.3677611351013184, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24787116795778275, "step": 7522 }, { "epoch": 0.15048, "grad_norm": 2.296875, "grad_norm_var": 0.010892486572265625, "learning_rate": 0.0001, "loss": 4.0489, "loss/crossentropy": 1.8500076532363892, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19271107017993927, "step": 7524 }, { "epoch": 0.15052, "grad_norm": 2.171875, "grad_norm_var": 0.010432688395182292, "learning_rate": 0.0001, "loss": 4.3298, "loss/crossentropy": 1.8013980984687805, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2181655913591385, "step": 7526 }, { "epoch": 0.15056, "grad_norm": 2.09375, "grad_norm_var": 0.010361480712890624, "learning_rate": 0.0001, "loss": 4.4958, "loss/crossentropy": 2.6469568014144897, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2441466525197029, "step": 7528 }, { "epoch": 0.1506, "grad_norm": 2.203125, "grad_norm_var": 0.009227498372395834, "learning_rate": 0.0001, "loss": 4.1317, "loss/crossentropy": 1.7992960214614868, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2209397256374359, "step": 7530 }, { "epoch": 0.15064, "grad_norm": 2.625, "grad_norm_var": 0.023258463541666666, "learning_rate": 0.0001, "loss": 4.9501, "loss/crossentropy": 2.1914783120155334, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23764144629240036, "step": 7532 }, { "epoch": 0.15068, "grad_norm": 2.140625, "grad_norm_var": 0.02906494140625, "learning_rate": 0.0001, "loss": 4.4833, "loss/crossentropy": 2.0035120844841003, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21483591943979263, "step": 7534 }, { "epoch": 0.15072, "grad_norm": 1.921875, "grad_norm_var": 0.03235677083333333, "learning_rate": 0.0001, "loss": 4.2674, "loss/crossentropy": 2.0218639969825745, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20919294655323029, "step": 7536 }, { "epoch": 0.15076, "grad_norm": 2.015625, "grad_norm_var": 0.03219401041666667, "learning_rate": 0.0001, "loss": 4.1091, "loss/crossentropy": 2.006688416004181, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21247616410255432, "step": 7538 }, { "epoch": 0.1508, "grad_norm": 2.484375, "grad_norm_var": 0.03542378743489583, "learning_rate": 0.0001, "loss": 4.5424, "loss/crossentropy": 2.1565613746643066, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.30109211802482605, "step": 7540 }, { "epoch": 0.15084, "grad_norm": 2.359375, "grad_norm_var": 0.03603413899739583, "learning_rate": 0.0001, "loss": 4.3621, "loss/crossentropy": 1.9676685333251953, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23605409264564514, "step": 7542 }, { "epoch": 0.15088, "grad_norm": 2.34375, "grad_norm_var": 0.051493326822916664, "learning_rate": 0.0001, "loss": 4.4178, "loss/crossentropy": 2.1150137186050415, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22903620451688766, "step": 7544 }, { "epoch": 0.15092, "grad_norm": 2.34375, "grad_norm_var": 0.051253255208333334, "learning_rate": 0.0001, "loss": 4.3186, "loss/crossentropy": 2.1894554495811462, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24362927675247192, "step": 7546 }, { "epoch": 0.15096, "grad_norm": 2.21875, "grad_norm_var": 0.0410308837890625, "learning_rate": 0.0001, "loss": 4.4529, "loss/crossentropy": 1.9624019861221313, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22628726810216904, "step": 7548 }, { "epoch": 0.151, "grad_norm": 2.0, "grad_norm_var": 0.04338277180989583, "learning_rate": 0.0001, "loss": 4.2795, "loss/crossentropy": 2.1547625064849854, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2295984849333763, "step": 7550 }, { "epoch": 0.15104, "grad_norm": 2.125, "grad_norm_var": 0.03857014973958333, "learning_rate": 0.0001, "loss": 4.4638, "loss/crossentropy": 2.229305863380432, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2427791878581047, "step": 7552 }, { "epoch": 0.15108, "grad_norm": 2.4375, "grad_norm_var": 0.037093098958333334, "learning_rate": 0.0001, "loss": 4.7319, "loss/crossentropy": 2.4998362064361572, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23718395829200745, "step": 7554 }, { "epoch": 0.15112, "grad_norm": 2.15625, "grad_norm_var": 0.033722941080729166, "learning_rate": 0.0001, "loss": 4.2118, "loss/crossentropy": 2.319428563117981, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26024487614631653, "step": 7556 }, { "epoch": 0.15116, "grad_norm": 2.015625, "grad_norm_var": 0.0357818603515625, "learning_rate": 0.0001, "loss": 4.2451, "loss/crossentropy": 1.861966609954834, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2121218591928482, "step": 7558 }, { "epoch": 0.1512, "grad_norm": 2.265625, "grad_norm_var": 0.015751139322916666, "learning_rate": 0.0001, "loss": 4.5963, "loss/crossentropy": 2.1688510179519653, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22341617196798325, "step": 7560 }, { "epoch": 0.15124, "grad_norm": 2.140625, "grad_norm_var": 0.013932291666666667, "learning_rate": 0.0001, "loss": 4.4808, "loss/crossentropy": 2.2833406925201416, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22648481279611588, "step": 7562 }, { "epoch": 0.15128, "grad_norm": 2.203125, "grad_norm_var": 0.0138671875, "learning_rate": 0.0001, "loss": 4.4126, "loss/crossentropy": 2.224393129348755, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23414459079504013, "step": 7564 }, { "epoch": 0.15132, "grad_norm": 2.25, "grad_norm_var": 0.0098297119140625, "learning_rate": 0.0001, "loss": 4.253, "loss/crossentropy": 2.003828763961792, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2166791632771492, "step": 7566 }, { "epoch": 0.15136, "grad_norm": 2.3125, "grad_norm_var": 0.01168212890625, "learning_rate": 0.0001, "loss": 4.0904, "loss/crossentropy": 2.1813005208969116, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.28873007744550705, "step": 7568 }, { "epoch": 0.1514, "grad_norm": 1.9921875, "grad_norm_var": 0.028696441650390626, "learning_rate": 0.0001, "loss": 4.3824, "loss/crossentropy": 2.3818061351776123, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24916332960128784, "step": 7570 }, { "epoch": 0.15144, "grad_norm": 2.03125, "grad_norm_var": 0.030460357666015625, "learning_rate": 0.0001, "loss": 4.1825, "loss/crossentropy": 2.041518449783325, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22352956235408783, "step": 7572 }, { "epoch": 0.15148, "grad_norm": 1.9609375, "grad_norm_var": 0.0326904296875, "learning_rate": 0.0001, "loss": 3.8531, "loss/crossentropy": 1.856759488582611, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20227209478616714, "step": 7574 }, { "epoch": 0.15152, "grad_norm": 2.09375, "grad_norm_var": 0.03157450358072917, "learning_rate": 0.0001, "loss": 4.237, "loss/crossentropy": 1.9612281918525696, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22188737243413925, "step": 7576 }, { "epoch": 0.15156, "grad_norm": 2.015625, "grad_norm_var": 0.0337554931640625, "learning_rate": 0.0001, "loss": 4.0405, "loss/crossentropy": 1.9610649943351746, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2282957062125206, "step": 7578 }, { "epoch": 0.1516, "grad_norm": 2.078125, "grad_norm_var": 0.03400472005208333, "learning_rate": 0.0001, "loss": 4.1253, "loss/crossentropy": 1.9239189624786377, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22926658391952515, "step": 7580 }, { "epoch": 0.15164, "grad_norm": 2.171875, "grad_norm_var": 0.033299763997395836, "learning_rate": 0.0001, "loss": 4.548, "loss/crossentropy": 2.425737738609314, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24260805547237396, "step": 7582 }, { "epoch": 0.15168, "grad_norm": 2.28125, "grad_norm_var": 0.03242085774739583, "learning_rate": 0.0001, "loss": 4.2894, "loss/crossentropy": 1.9602521061897278, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23231150209903717, "step": 7584 }, { "epoch": 0.15172, "grad_norm": 2.03125, "grad_norm_var": 0.007954661051432292, "learning_rate": 0.0001, "loss": 4.3131, "loss/crossentropy": 2.3671375513076782, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24326395988464355, "step": 7586 }, { "epoch": 0.15176, "grad_norm": 2.015625, "grad_norm_var": 0.007675933837890625, "learning_rate": 0.0001, "loss": 4.1297, "loss/crossentropy": 2.0128119587898254, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20082567632198334, "step": 7588 }, { "epoch": 0.1518, "grad_norm": 2.265625, "grad_norm_var": 0.0085845947265625, "learning_rate": 0.0001, "loss": 4.5813, "loss/crossentropy": 2.3719125986099243, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.243422269821167, "step": 7590 }, { "epoch": 0.15184, "grad_norm": 2.28125, "grad_norm_var": 0.010575358072916667, "learning_rate": 0.0001, "loss": 4.6177, "loss/crossentropy": 2.088695764541626, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23578013479709625, "step": 7592 }, { "epoch": 0.15188, "grad_norm": 2.203125, "grad_norm_var": 0.01099853515625, "learning_rate": 0.0001, "loss": 4.3719, "loss/crossentropy": 2.151872456073761, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2496410757303238, "step": 7594 }, { "epoch": 0.15192, "grad_norm": 2.265625, "grad_norm_var": 0.017281087239583333, "learning_rate": 0.0001, "loss": 4.6041, "loss/crossentropy": 2.020963430404663, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25003430247306824, "step": 7596 }, { "epoch": 0.15196, "grad_norm": 2.34375, "grad_norm_var": 0.0199615478515625, "learning_rate": 0.0001, "loss": 4.4439, "loss/crossentropy": 1.9290395379066467, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20771078765392303, "step": 7598 }, { "epoch": 0.152, "grad_norm": 2.09375, "grad_norm_var": 0.020536295572916665, "learning_rate": 0.0001, "loss": 4.2461, "loss/crossentropy": 2.0420188307762146, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2375432327389717, "step": 7600 }, { "epoch": 0.15204, "grad_norm": 2.21875, "grad_norm_var": 0.019169108072916666, "learning_rate": 0.0001, "loss": 4.4291, "loss/crossentropy": 2.474969744682312, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2535504847764969, "step": 7602 }, { "epoch": 0.15208, "grad_norm": 2.3125, "grad_norm_var": 0.015550740559895833, "learning_rate": 0.0001, "loss": 4.3487, "loss/crossentropy": 2.177125334739685, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24919381737709045, "step": 7604 }, { "epoch": 0.15212, "grad_norm": 2.3125, "grad_norm_var": 0.015510050455729167, "learning_rate": 0.0001, "loss": 4.4719, "loss/crossentropy": 2.348747491836548, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22336618602275848, "step": 7606 }, { "epoch": 0.15216, "grad_norm": 2.28125, "grad_norm_var": 0.014676920572916667, "learning_rate": 0.0001, "loss": 4.4502, "loss/crossentropy": 1.718321442604065, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19830843806266785, "step": 7608 }, { "epoch": 0.1522, "grad_norm": 2.171875, "grad_norm_var": 0.015901692708333335, "learning_rate": 0.0001, "loss": 4.4505, "loss/crossentropy": 2.2954800128936768, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23203244805335999, "step": 7610 }, { "epoch": 0.15224, "grad_norm": 2.0625, "grad_norm_var": 0.013570149739583334, "learning_rate": 0.0001, "loss": 4.1141, "loss/crossentropy": 1.6918454766273499, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18312305957078934, "step": 7612 }, { "epoch": 0.15228, "grad_norm": 2.015625, "grad_norm_var": 0.012727864583333333, "learning_rate": 0.0001, "loss": 4.1312, "loss/crossentropy": 1.887774109840393, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2151220142841339, "step": 7614 }, { "epoch": 0.15232, "grad_norm": 2.125, "grad_norm_var": 0.01373291015625, "learning_rate": 0.0001, "loss": 4.1274, "loss/crossentropy": 2.0903998613357544, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22486191242933273, "step": 7616 }, { "epoch": 0.15236, "grad_norm": 2.0625, "grad_norm_var": 0.016310373942057293, "learning_rate": 0.0001, "loss": 4.0659, "loss/crossentropy": 1.929358720779419, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20021560043096542, "step": 7618 }, { "epoch": 0.1524, "grad_norm": 2.703125, "grad_norm_var": 0.03484064737955729, "learning_rate": 0.0001, "loss": 4.5726, "loss/crossentropy": 1.9812661409378052, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21395482122898102, "step": 7620 }, { "epoch": 0.15244, "grad_norm": 2.390625, "grad_norm_var": 0.03358942667643229, "learning_rate": 0.0001, "loss": 4.15, "loss/crossentropy": 2.148552179336548, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22805052995681763, "step": 7622 }, { "epoch": 0.15248, "grad_norm": 2.078125, "grad_norm_var": 0.03416926066080729, "learning_rate": 0.0001, "loss": 4.3471, "loss/crossentropy": 2.012804687023163, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2248745858669281, "step": 7624 }, { "epoch": 0.15252, "grad_norm": 2.046875, "grad_norm_var": 0.03463312784830729, "learning_rate": 0.0001, "loss": 4.3948, "loss/crossentropy": 2.3378156423568726, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2457970678806305, "step": 7626 }, { "epoch": 0.15256, "grad_norm": 2.296875, "grad_norm_var": 0.036382802327473956, "learning_rate": 0.0001, "loss": 4.5414, "loss/crossentropy": 2.0815274119377136, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22083784639835358, "step": 7628 }, { "epoch": 0.1526, "grad_norm": 1.984375, "grad_norm_var": 0.039589182535807295, "learning_rate": 0.0001, "loss": 4.2609, "loss/crossentropy": 2.172307014465332, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25988608598709106, "step": 7630 }, { "epoch": 0.15264, "grad_norm": 1.9921875, "grad_norm_var": 0.03921305338541667, "learning_rate": 0.0001, "loss": 4.3711, "loss/crossentropy": 1.973683476448059, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20379862189292908, "step": 7632 }, { "epoch": 0.15268, "grad_norm": 1.921875, "grad_norm_var": 0.039406077067057295, "learning_rate": 0.0001, "loss": 4.3394, "loss/crossentropy": 2.175020456314087, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22480874508619308, "step": 7634 }, { "epoch": 0.15272, "grad_norm": 2.1875, "grad_norm_var": 0.022304026285807292, "learning_rate": 0.0001, "loss": 4.2176, "loss/crossentropy": 2.063227415084839, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2324352264404297, "step": 7636 }, { "epoch": 0.15276, "grad_norm": 2.296875, "grad_norm_var": 0.019769032796223957, "learning_rate": 0.0001, "loss": 4.4654, "loss/crossentropy": 1.9297555088996887, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2171614021062851, "step": 7638 }, { "epoch": 0.1528, "grad_norm": 2.296875, "grad_norm_var": 0.019421132405598958, "learning_rate": 0.0001, "loss": 4.4698, "loss/crossentropy": 1.9808542132377625, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21765583008527756, "step": 7640 }, { "epoch": 0.15284, "grad_norm": 2.09375, "grad_norm_var": 0.018790435791015626, "learning_rate": 0.0001, "loss": 4.4145, "loss/crossentropy": 2.009860336780548, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20963389426469803, "step": 7642 }, { "epoch": 0.15288, "grad_norm": 2.015625, "grad_norm_var": 0.017319488525390624, "learning_rate": 0.0001, "loss": 4.2649, "loss/crossentropy": 1.912703514099121, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23796956986188889, "step": 7644 }, { "epoch": 0.15292, "grad_norm": 2.25, "grad_norm_var": 0.012341054280598958, "learning_rate": 0.0001, "loss": 4.4965, "loss/crossentropy": 2.2945642471313477, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2621624022722244, "step": 7646 }, { "epoch": 0.15296, "grad_norm": 2.03125, "grad_norm_var": 0.011067708333333334, "learning_rate": 0.0001, "loss": 4.2573, "loss/crossentropy": 2.185902237892151, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23890018463134766, "step": 7648 }, { "epoch": 0.153, "grad_norm": 2.546875, "grad_norm_var": 0.018648274739583335, "learning_rate": 0.0001, "loss": 4.5815, "loss/crossentropy": 1.8625503778457642, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2206135168671608, "step": 7650 }, { "epoch": 0.15304, "grad_norm": 1.9765625, "grad_norm_var": 0.020684560139973957, "learning_rate": 0.0001, "loss": 3.9142, "loss/crossentropy": 1.8815893530845642, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21166741847991943, "step": 7652 }, { "epoch": 0.15308, "grad_norm": 2.203125, "grad_norm_var": 0.020979563395182293, "learning_rate": 0.0001, "loss": 4.2277, "loss/crossentropy": 2.156697630882263, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22994951903820038, "step": 7654 }, { "epoch": 0.15312, "grad_norm": 2.25, "grad_norm_var": 0.02005182902018229, "learning_rate": 0.0001, "loss": 4.366, "loss/crossentropy": 1.6800576448440552, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21905134618282318, "step": 7656 }, { "epoch": 0.15316, "grad_norm": 2.046875, "grad_norm_var": 0.02061945597330729, "learning_rate": 0.0001, "loss": 4.1446, "loss/crossentropy": 1.9215145707130432, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2034405767917633, "step": 7658 }, { "epoch": 0.1532, "grad_norm": 2.390625, "grad_norm_var": 0.02380956013997396, "learning_rate": 0.0001, "loss": 4.6475, "loss/crossentropy": 2.153718650341034, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25605182349681854, "step": 7660 }, { "epoch": 0.15324, "grad_norm": 2.203125, "grad_norm_var": 0.02466608683268229, "learning_rate": 0.0001, "loss": 4.2676, "loss/crossentropy": 1.8782889246940613, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20574549585580826, "step": 7662 }, { "epoch": 0.15328, "grad_norm": 2.265625, "grad_norm_var": 0.031040191650390625, "learning_rate": 0.0001, "loss": 4.2171, "loss/crossentropy": 2.003354489803314, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24152260273694992, "step": 7664 }, { "epoch": 0.15332, "grad_norm": 2.203125, "grad_norm_var": 0.022299957275390626, "learning_rate": 0.0001, "loss": 4.5828, "loss/crossentropy": 2.217758059501648, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24700726568698883, "step": 7666 }, { "epoch": 0.15336, "grad_norm": 2.1875, "grad_norm_var": 0.017757161458333334, "learning_rate": 0.0001, "loss": 4.3418, "loss/crossentropy": 1.934537410736084, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2163502648472786, "step": 7668 }, { "epoch": 0.1534, "grad_norm": 2.03125, "grad_norm_var": 0.020340983072916666, "learning_rate": 0.0001, "loss": 4.3, "loss/crossentropy": 2.007661819458008, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21769292652606964, "step": 7670 }, { "epoch": 0.15344, "grad_norm": 2.09375, "grad_norm_var": 0.020653279622395833, "learning_rate": 0.0001, "loss": 4.3082, "loss/crossentropy": 2.1586949825286865, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2453690618276596, "step": 7672 }, { "epoch": 0.15348, "grad_norm": 2.109375, "grad_norm_var": 0.0197174072265625, "learning_rate": 0.0001, "loss": 4.4433, "loss/crossentropy": 2.2201706171035767, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22639702260494232, "step": 7674 }, { "epoch": 0.15352, "grad_norm": 2.15625, "grad_norm_var": 0.015104166666666667, "learning_rate": 0.0001, "loss": 4.2337, "loss/crossentropy": 2.059146285057068, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22411910444498062, "step": 7676 }, { "epoch": 0.15356, "grad_norm": 2.0625, "grad_norm_var": 0.015397135416666667, "learning_rate": 0.0001, "loss": 4.2318, "loss/crossentropy": 1.9768954515457153, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21798508614301682, "step": 7678 }, { "epoch": 0.1536, "grad_norm": 2.109375, "grad_norm_var": 0.003413899739583333, "learning_rate": 0.0001, "loss": 4.4823, "loss/crossentropy": 1.8555094003677368, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19453337788581848, "step": 7680 }, { "epoch": 0.15364, "grad_norm": 2.03125, "grad_norm_var": 0.0036783854166666666, "learning_rate": 0.0001, "loss": 4.0379, "loss/crossentropy": 1.5948917865753174, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17746463418006897, "step": 7682 }, { "epoch": 0.15368, "grad_norm": 2.1875, "grad_norm_var": 0.0034464518229166668, "learning_rate": 0.0001, "loss": 4.2901, "loss/crossentropy": 1.8917757868766785, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2103574424982071, "step": 7684 }, { "epoch": 0.15372, "grad_norm": 2.140625, "grad_norm_var": 0.0030670166015625, "learning_rate": 0.0001, "loss": 4.3326, "loss/crossentropy": 2.090232729911804, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2199823334813118, "step": 7686 }, { "epoch": 0.15376, "grad_norm": 2.03125, "grad_norm_var": 0.004979451497395833, "learning_rate": 0.0001, "loss": 4.3695, "loss/crossentropy": 1.8155178427696228, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21044857800006866, "step": 7688 }, { "epoch": 0.1538, "grad_norm": 2.09375, "grad_norm_var": 0.0063629150390625, "learning_rate": 0.0001, "loss": 4.5003, "loss/crossentropy": 2.31532621383667, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2437051385641098, "step": 7690 }, { "epoch": 0.15384, "grad_norm": 2.078125, "grad_norm_var": 0.0065419514973958336, "learning_rate": 0.0001, "loss": 4.3815, "loss/crossentropy": 1.9688079357147217, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22686263918876648, "step": 7692 }, { "epoch": 0.15388, "grad_norm": 2.046875, "grad_norm_var": 0.010001373291015626, "learning_rate": 0.0001, "loss": 3.9937, "loss/crossentropy": 1.9029017686843872, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20635761320590973, "step": 7694 }, { "epoch": 0.15392, "grad_norm": 1.9140625, "grad_norm_var": 0.010587565104166667, "learning_rate": 0.0001, "loss": 3.91, "loss/crossentropy": 1.9817028641700745, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2080235257744789, "step": 7696 }, { "epoch": 0.15396, "grad_norm": 2.171875, "grad_norm_var": 0.017463175455729167, "learning_rate": 0.0001, "loss": 4.3301, "loss/crossentropy": 2.3392014503479004, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26226382702589035, "step": 7698 }, { "epoch": 0.154, "grad_norm": 1.9375, "grad_norm_var": 0.021320597330729166, "learning_rate": 0.0001, "loss": 4.0381, "loss/crossentropy": 1.7265403866767883, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21790936589241028, "step": 7700 }, { "epoch": 0.15404, "grad_norm": 2.203125, "grad_norm_var": 0.02276585896809896, "learning_rate": 0.0001, "loss": 4.1725, "loss/crossentropy": 2.024384081363678, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21987473219633102, "step": 7702 }, { "epoch": 0.15408, "grad_norm": 2.5, "grad_norm_var": 0.03117650349934896, "learning_rate": 0.0001, "loss": 4.6702, "loss/crossentropy": 2.1840893030166626, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24231631308794022, "step": 7704 }, { "epoch": 0.15412, "grad_norm": 2.171875, "grad_norm_var": 0.03001683553059896, "learning_rate": 0.0001, "loss": 4.2197, "loss/crossentropy": 2.1950928568840027, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21808429062366486, "step": 7706 }, { "epoch": 0.15416, "grad_norm": 2.140625, "grad_norm_var": 0.03029352823893229, "learning_rate": 0.0001, "loss": 4.3215, "loss/crossentropy": 1.9541537165641785, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22109197825193405, "step": 7708 }, { "epoch": 0.1542, "grad_norm": 2.03125, "grad_norm_var": 0.0257476806640625, "learning_rate": 0.0001, "loss": 4.1966, "loss/crossentropy": 2.0232877135276794, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.214762382209301, "step": 7710 }, { "epoch": 0.15424, "grad_norm": 1.9609375, "grad_norm_var": 0.023859659830729168, "learning_rate": 0.0001, "loss": 4.0012, "loss/crossentropy": 2.003768503665924, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21820105612277985, "step": 7712 }, { "epoch": 0.15428, "grad_norm": 2.21875, "grad_norm_var": 0.020026652018229167, "learning_rate": 0.0001, "loss": 4.2471, "loss/crossentropy": 2.007221221923828, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2203196883201599, "step": 7714 }, { "epoch": 0.15432, "grad_norm": 2.09375, "grad_norm_var": 0.015900675455729166, "learning_rate": 0.0001, "loss": 4.3201, "loss/crossentropy": 2.0134615898132324, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21916545927524567, "step": 7716 }, { "epoch": 0.15436, "grad_norm": 2.328125, "grad_norm_var": 0.016013336181640626, "learning_rate": 0.0001, "loss": 4.3821, "loss/crossentropy": 1.9012999534606934, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25758640468120575, "step": 7718 }, { "epoch": 0.1544, "grad_norm": 2.25, "grad_norm_var": 0.008727773030598959, "learning_rate": 0.0001, "loss": 4.1555, "loss/crossentropy": 2.074169874191284, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22667942196130753, "step": 7720 }, { "epoch": 0.15444, "grad_norm": 2.21875, "grad_norm_var": 0.009059397379557292, "learning_rate": 0.0001, "loss": 4.4355, "loss/crossentropy": 2.070925295352936, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20353248715400696, "step": 7722 }, { "epoch": 0.15448, "grad_norm": 2.359375, "grad_norm_var": 0.011163075764973959, "learning_rate": 0.0001, "loss": 4.5676, "loss/crossentropy": 2.289568066596985, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23175117373466492, "step": 7724 }, { "epoch": 0.15452, "grad_norm": 2.15625, "grad_norm_var": 0.012827301025390625, "learning_rate": 0.0001, "loss": 4.6152, "loss/crossentropy": 2.21374249458313, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23585008084774017, "step": 7726 }, { "epoch": 0.15456, "grad_norm": 2.328125, "grad_norm_var": 0.010791015625, "learning_rate": 0.0001, "loss": 4.5575, "loss/crossentropy": 2.15897136926651, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24143048375844955, "step": 7728 }, { "epoch": 0.1546, "grad_norm": 2.359375, "grad_norm_var": 0.01142578125, "learning_rate": 0.0001, "loss": 4.4722, "loss/crossentropy": 2.134206771850586, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24344487488269806, "step": 7730 }, { "epoch": 0.15464, "grad_norm": 2.359375, "grad_norm_var": 0.011180623372395834, "learning_rate": 0.0001, "loss": 4.5827, "loss/crossentropy": 2.3832513093948364, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24878299236297607, "step": 7732 }, { "epoch": 0.15468, "grad_norm": 2.203125, "grad_norm_var": 0.0102203369140625, "learning_rate": 0.0001, "loss": 4.0722, "loss/crossentropy": 1.917544960975647, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20134451985359192, "step": 7734 }, { "epoch": 0.15472, "grad_norm": 2.40625, "grad_norm_var": 0.011767578125, "learning_rate": 0.0001, "loss": 4.4499, "loss/crossentropy": 2.1081286668777466, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23577219247817993, "step": 7736 }, { "epoch": 0.15476, "grad_norm": 2.4375, "grad_norm_var": 0.0137603759765625, "learning_rate": 0.0001, "loss": 4.7942, "loss/crossentropy": 2.214662790298462, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23744845390319824, "step": 7738 }, { "epoch": 0.1548, "grad_norm": 2.359375, "grad_norm_var": 0.013895670572916666, "learning_rate": 0.0001, "loss": 4.4325, "loss/crossentropy": 1.995256781578064, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21037640422582626, "step": 7740 }, { "epoch": 0.15484, "grad_norm": 1.890625, "grad_norm_var": 0.024006144205729166, "learning_rate": 0.0001, "loss": 4.0701, "loss/crossentropy": 2.2877765893936157, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23090071976184845, "step": 7742 }, { "epoch": 0.15488, "grad_norm": 2.125, "grad_norm_var": 0.022652180989583333, "learning_rate": 0.0001, "loss": 4.6167, "loss/crossentropy": 2.23935329914093, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24617131054401398, "step": 7744 }, { "epoch": 0.15492, "grad_norm": 2.125, "grad_norm_var": 0.023005167643229168, "learning_rate": 0.0001, "loss": 4.4886, "loss/crossentropy": 2.15006422996521, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2235095053911209, "step": 7746 }, { "epoch": 0.15496, "grad_norm": 2.015625, "grad_norm_var": 0.024144490559895832, "learning_rate": 0.0001, "loss": 4.1873, "loss/crossentropy": 1.9917905926704407, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21606986224651337, "step": 7748 }, { "epoch": 0.155, "grad_norm": 2.203125, "grad_norm_var": 0.025804646809895835, "learning_rate": 0.0001, "loss": 4.6347, "loss/crossentropy": 2.303179979324341, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2541813999414444, "step": 7750 }, { "epoch": 0.15504, "grad_norm": 2.109375, "grad_norm_var": 0.022786458333333332, "learning_rate": 0.0001, "loss": 4.3647, "loss/crossentropy": 2.231510281562805, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2479737550020218, "step": 7752 }, { "epoch": 0.15508, "grad_norm": 2.25, "grad_norm_var": 0.01778132120768229, "learning_rate": 0.0001, "loss": 4.1714, "loss/crossentropy": 2.0530437231063843, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20220057666301727, "step": 7754 }, { "epoch": 0.15512, "grad_norm": 2.171875, "grad_norm_var": 0.015909830729166668, "learning_rate": 0.0001, "loss": 4.1978, "loss/crossentropy": 2.0889222025871277, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22389977425336838, "step": 7756 }, { "epoch": 0.15516, "grad_norm": 2.28125, "grad_norm_var": 0.0131256103515625, "learning_rate": 0.0001, "loss": 4.4748, "loss/crossentropy": 2.3807711601257324, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2397037297487259, "step": 7758 }, { "epoch": 0.1552, "grad_norm": 2.171875, "grad_norm_var": 0.01297607421875, "learning_rate": 0.0001, "loss": 4.3382, "loss/crossentropy": 1.8144067525863647, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19052604585886002, "step": 7760 }, { "epoch": 0.15524, "grad_norm": 2.09375, "grad_norm_var": 0.013240559895833334, "learning_rate": 0.0001, "loss": 4.5358, "loss/crossentropy": 2.295349955558777, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2168404459953308, "step": 7762 }, { "epoch": 0.15528, "grad_norm": 11.8125, "grad_norm_var": 5.868936920166016, "learning_rate": 0.0001, "loss": 4.1706, "loss/crossentropy": 1.7281805276870728, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.230300635099411, "step": 7764 }, { "epoch": 0.15532, "grad_norm": 2.375, "grad_norm_var": 5.861083730061849, "learning_rate": 0.0001, "loss": 4.3867, "loss/crossentropy": 2.2153135538101196, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24419061839580536, "step": 7766 }, { "epoch": 0.15536, "grad_norm": 2.03125, "grad_norm_var": 5.890169270833334, "learning_rate": 0.0001, "loss": 4.2047, "loss/crossentropy": 2.0259060859680176, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2119704708456993, "step": 7768 }, { "epoch": 0.1554, "grad_norm": 2.21875, "grad_norm_var": 5.871726226806641, "learning_rate": 0.0001, "loss": 4.3405, "loss/crossentropy": 2.2399297952651978, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23252833634614944, "step": 7770 }, { "epoch": 0.15544, "grad_norm": 2.109375, "grad_norm_var": 5.86380615234375, "learning_rate": 0.0001, "loss": 4.3287, "loss/crossentropy": 2.0974661111831665, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2085232511162758, "step": 7772 }, { "epoch": 0.15548, "grad_norm": 2.21875, "grad_norm_var": 5.861717732747396, "learning_rate": 0.0001, "loss": 4.4141, "loss/crossentropy": 2.0121108293533325, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24354346096515656, "step": 7774 }, { "epoch": 0.15552, "grad_norm": 2.171875, "grad_norm_var": 5.843431599934896, "learning_rate": 0.0001, "loss": 4.3167, "loss/crossentropy": 2.1463273763656616, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22797952592372894, "step": 7776 }, { "epoch": 0.15556, "grad_norm": 1.9140625, "grad_norm_var": 5.875705718994141, "learning_rate": 0.0001, "loss": 3.9814, "loss/crossentropy": 1.6362827122211456, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18291430547833443, "step": 7778 }, { "epoch": 0.1556, "grad_norm": 2.21875, "grad_norm_var": 0.016283162434895835, "learning_rate": 0.0001, "loss": 4.3534, "loss/crossentropy": 2.4894620180130005, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24631594866514206, "step": 7780 }, { "epoch": 0.15564, "grad_norm": 2.15625, "grad_norm_var": 0.0119781494140625, "learning_rate": 0.0001, "loss": 4.5799, "loss/crossentropy": 2.346967577934265, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20964853465557098, "step": 7782 }, { "epoch": 0.15568, "grad_norm": 2.03125, "grad_norm_var": 0.010282135009765625, "learning_rate": 0.0001, "loss": 4.5036, "loss/crossentropy": 2.0165189504623413, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22410035878419876, "step": 7784 }, { "epoch": 0.15572, "grad_norm": 2.140625, "grad_norm_var": 0.009417470296223958, "learning_rate": 0.0001, "loss": 4.0735, "loss/crossentropy": 1.7486848831176758, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20792805403470993, "step": 7786 }, { "epoch": 0.15576, "grad_norm": 2.03125, "grad_norm_var": 0.009905751546223958, "learning_rate": 0.0001, "loss": 4.1321, "loss/crossentropy": 1.9615037441253662, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22578728944063187, "step": 7788 }, { "epoch": 0.1558, "grad_norm": 2.09375, "grad_norm_var": 0.009069569905598958, "learning_rate": 0.0001, "loss": 4.326, "loss/crossentropy": 1.8386783003807068, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21898606419563293, "step": 7790 }, { "epoch": 0.15584, "grad_norm": 2.15625, "grad_norm_var": 0.011533355712890625, "learning_rate": 0.0001, "loss": 4.6429, "loss/crossentropy": 2.1383039951324463, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2322411835193634, "step": 7792 }, { "epoch": 0.15588, "grad_norm": 2.21875, "grad_norm_var": 0.008918253580729167, "learning_rate": 0.0001, "loss": 4.3568, "loss/crossentropy": 2.510488271713257, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24672146886587143, "step": 7794 }, { "epoch": 0.15592, "grad_norm": 2.125, "grad_norm_var": 0.009309895833333333, "learning_rate": 0.0001, "loss": 4.4328, "loss/crossentropy": 2.03993421792984, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23064473271369934, "step": 7796 }, { "epoch": 0.15596, "grad_norm": 2.09375, "grad_norm_var": 0.007306925455729167, "learning_rate": 0.0001, "loss": 4.4833, "loss/crossentropy": 2.305809736251831, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21916814893484116, "step": 7798 }, { "epoch": 0.156, "grad_norm": 2.03125, "grad_norm_var": 0.007796223958333333, "learning_rate": 0.0001, "loss": 4.1218, "loss/crossentropy": 1.8330454230308533, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20832987129688263, "step": 7800 }, { "epoch": 0.15604, "grad_norm": 2.140625, "grad_norm_var": 0.007420857747395833, "learning_rate": 0.0001, "loss": 4.2579, "loss/crossentropy": 1.9194663166999817, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22163349390029907, "step": 7802 }, { "epoch": 0.15608, "grad_norm": 2.078125, "grad_norm_var": 0.006917317708333333, "learning_rate": 0.0001, "loss": 4.2219, "loss/crossentropy": 1.798878252506256, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19532181322574615, "step": 7804 }, { "epoch": 0.15612, "grad_norm": 2.140625, "grad_norm_var": 0.0067942301432291664, "learning_rate": 0.0001, "loss": 4.2949, "loss/crossentropy": 1.730432152748108, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20092248916625977, "step": 7806 }, { "epoch": 0.15616, "grad_norm": 2.515625, "grad_norm_var": 0.01304931640625, "learning_rate": 0.0001, "loss": 4.3109, "loss/crossentropy": 2.1426968574523926, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22185371816158295, "step": 7808 }, { "epoch": 0.1562, "grad_norm": 2.078125, "grad_norm_var": 0.012848917643229167, "learning_rate": 0.0001, "loss": 4.4308, "loss/crossentropy": 1.983969271183014, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22203146666288376, "step": 7810 }, { "epoch": 0.15624, "grad_norm": 2.0625, "grad_norm_var": 0.016532389322916667, "learning_rate": 0.0001, "loss": 4.2114, "loss/crossentropy": 2.2948192954063416, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23453453928232193, "step": 7812 }, { "epoch": 0.15628, "grad_norm": 2.09375, "grad_norm_var": 0.0175933837890625, "learning_rate": 0.0001, "loss": 4.287, "loss/crossentropy": 1.9190048575401306, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22457829862833023, "step": 7814 }, { "epoch": 0.15632, "grad_norm": 2.171875, "grad_norm_var": 0.01529541015625, "learning_rate": 0.0001, "loss": 4.1721, "loss/crossentropy": 2.0268847346305847, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21628276258707047, "step": 7816 }, { "epoch": 0.15636, "grad_norm": 2.203125, "grad_norm_var": 0.017411295572916666, "learning_rate": 0.0001, "loss": 4.3553, "loss/crossentropy": 2.0050706267356873, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20108875632286072, "step": 7818 }, { "epoch": 0.1564, "grad_norm": 2.453125, "grad_norm_var": 0.024470774332682292, "learning_rate": 0.0001, "loss": 4.2059, "loss/crossentropy": 2.096635937690735, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22623063623905182, "step": 7820 }, { "epoch": 0.15644, "grad_norm": 2.21875, "grad_norm_var": 0.024580637613932293, "learning_rate": 0.0001, "loss": 4.3901, "loss/crossentropy": 1.9188768863677979, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21088901162147522, "step": 7822 }, { "epoch": 0.15648, "grad_norm": 2.203125, "grad_norm_var": 0.018304189046223957, "learning_rate": 0.0001, "loss": 4.4062, "loss/crossentropy": 2.3639817237854004, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2608294039964676, "step": 7824 }, { "epoch": 0.15652, "grad_norm": 2.1875, "grad_norm_var": 0.017765045166015625, "learning_rate": 0.0001, "loss": 4.2123, "loss/crossentropy": 1.9716956615447998, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23327408730983734, "step": 7826 }, { "epoch": 0.15656, "grad_norm": 2.265625, "grad_norm_var": 0.08131688435872396, "learning_rate": 0.0001, "loss": 4.1377, "loss/crossentropy": 2.004276990890503, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22930586338043213, "step": 7828 }, { "epoch": 0.1566, "grad_norm": 2.078125, "grad_norm_var": 0.08247858683268229, "learning_rate": 0.0001, "loss": 4.4703, "loss/crossentropy": 1.7996181845664978, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21030305325984955, "step": 7830 }, { "epoch": 0.15664, "grad_norm": 2.34375, "grad_norm_var": 0.09555435180664062, "learning_rate": 0.0001, "loss": 4.7938, "loss/crossentropy": 2.192178189754486, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2232302725315094, "step": 7832 }, { "epoch": 0.15668, "grad_norm": 2.109375, "grad_norm_var": 0.09538345336914063, "learning_rate": 0.0001, "loss": 4.2381, "loss/crossentropy": 1.7093925476074219, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19697313755750656, "step": 7834 }, { "epoch": 0.15672, "grad_norm": 2.234375, "grad_norm_var": 0.09130859375, "learning_rate": 0.0001, "loss": 4.2534, "loss/crossentropy": 1.915247917175293, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21003933250904083, "step": 7836 }, { "epoch": 0.15676, "grad_norm": 2.375, "grad_norm_var": 0.091162109375, "learning_rate": 0.0001, "loss": 4.3825, "loss/crossentropy": 2.188641667366028, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23260314762592316, "step": 7838 }, { "epoch": 0.1568, "grad_norm": 2.109375, "grad_norm_var": 0.09501953125, "learning_rate": 0.0001, "loss": 4.7329, "loss/crossentropy": 2.3316495418548584, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24601806700229645, "step": 7840 }, { "epoch": 0.15684, "grad_norm": 2.28125, "grad_norm_var": 0.0918121337890625, "learning_rate": 0.0001, "loss": 4.2934, "loss/crossentropy": 2.140946924686432, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23532958328723907, "step": 7842 }, { "epoch": 0.15688, "grad_norm": 2.21875, "grad_norm_var": 0.031525675455729166, "learning_rate": 0.0001, "loss": 4.2738, "loss/crossentropy": 2.372095465660095, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2615740895271301, "step": 7844 }, { "epoch": 0.15692, "grad_norm": 2.25, "grad_norm_var": 0.028727213541666668, "learning_rate": 0.0001, "loss": 4.1954, "loss/crossentropy": 1.8433185815811157, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2160269021987915, "step": 7846 }, { "epoch": 0.15696, "grad_norm": 2.140625, "grad_norm_var": 0.016071573893229166, "learning_rate": 0.0001, "loss": 4.2489, "loss/crossentropy": 2.012324333190918, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1964300200343132, "step": 7848 }, { "epoch": 0.157, "grad_norm": 2.203125, "grad_norm_var": 0.014090983072916667, "learning_rate": 0.0001, "loss": 4.486, "loss/crossentropy": 2.0325884222984314, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22080926597118378, "step": 7850 }, { "epoch": 0.15704, "grad_norm": 2.296875, "grad_norm_var": 0.013036092122395834, "learning_rate": 0.0001, "loss": 4.3784, "loss/crossentropy": 2.3786104917526245, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24448946118354797, "step": 7852 }, { "epoch": 0.15708, "grad_norm": 2.21875, "grad_norm_var": 0.011693318684895834, "learning_rate": 0.0001, "loss": 4.5173, "loss/crossentropy": 2.304913640022278, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24708375334739685, "step": 7854 }, { "epoch": 0.15712, "grad_norm": 2.1875, "grad_norm_var": 0.006371053059895834, "learning_rate": 0.0001, "loss": 4.5711, "loss/crossentropy": 1.8640215396881104, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19824489206075668, "step": 7856 }, { "epoch": 0.15716, "grad_norm": 2.0625, "grad_norm_var": 0.00625, "learning_rate": 0.0001, "loss": 4.6174, "loss/crossentropy": 2.444548487663269, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2542492523789406, "step": 7858 }, { "epoch": 0.1572, "grad_norm": 2.078125, "grad_norm_var": 0.00592041015625, "learning_rate": 0.0001, "loss": 4.0522, "loss/crossentropy": 1.991346299648285, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2159758359193802, "step": 7860 }, { "epoch": 0.15724, "grad_norm": 2.15625, "grad_norm_var": 0.005101521809895833, "learning_rate": 0.0001, "loss": 4.2627, "loss/crossentropy": 2.3172048926353455, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2506742626428604, "step": 7862 }, { "epoch": 0.15728, "grad_norm": 2.265625, "grad_norm_var": 0.0050201416015625, "learning_rate": 0.0001, "loss": 4.5415, "loss/crossentropy": 1.9302632212638855, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2104542776942253, "step": 7864 }, { "epoch": 0.15732, "grad_norm": 2.125, "grad_norm_var": 0.020726521809895832, "learning_rate": 0.0001, "loss": 4.7831, "loss/crossentropy": 2.457883358001709, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23538483679294586, "step": 7866 }, { "epoch": 0.15736, "grad_norm": 2.125, "grad_norm_var": 0.019677734375, "learning_rate": 0.0001, "loss": 4.1912, "loss/crossentropy": 1.9857566952705383, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2131117358803749, "step": 7868 }, { "epoch": 0.1574, "grad_norm": 2.203125, "grad_norm_var": 0.019140625, "learning_rate": 0.0001, "loss": 4.4264, "loss/crossentropy": 2.0062127113342285, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22806233912706375, "step": 7870 }, { "epoch": 0.15744, "grad_norm": 2.15625, "grad_norm_var": 0.019489542643229166, "learning_rate": 0.0001, "loss": 4.6181, "loss/crossentropy": 2.283127784729004, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23883548378944397, "step": 7872 }, { "epoch": 0.15748, "grad_norm": 2.234375, "grad_norm_var": 0.019136555989583335, "learning_rate": 0.0001, "loss": 4.4442, "loss/crossentropy": 2.110979437828064, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.209333136677742, "step": 7874 }, { "epoch": 0.15752, "grad_norm": 2.140625, "grad_norm_var": 0.019856770833333332, "learning_rate": 0.0001, "loss": 4.2107, "loss/crossentropy": 1.8705166578292847, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20791998505592346, "step": 7876 }, { "epoch": 0.15756, "grad_norm": 2.140625, "grad_norm_var": 0.019017537434895832, "learning_rate": 0.0001, "loss": 4.2956, "loss/crossentropy": 2.133803129196167, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23219672590494156, "step": 7878 }, { "epoch": 0.1576, "grad_norm": 2.21875, "grad_norm_var": 0.019627888997395832, "learning_rate": 0.0001, "loss": 4.5028, "loss/crossentropy": 2.1062549352645874, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2266012579202652, "step": 7880 }, { "epoch": 0.15764, "grad_norm": 2.28125, "grad_norm_var": 0.0058553059895833336, "learning_rate": 0.0001, "loss": 4.5078, "loss/crossentropy": 2.0088155269622803, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2079932913184166, "step": 7882 }, { "epoch": 0.15768, "grad_norm": 2.234375, "grad_norm_var": 0.0065582275390625, "learning_rate": 0.0001, "loss": 3.9204, "loss/crossentropy": 1.6621176600456238, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19370710104703903, "step": 7884 }, { "epoch": 0.15772, "grad_norm": 2.09375, "grad_norm_var": 0.007136027018229167, "learning_rate": 0.0001, "loss": 4.3462, "loss/crossentropy": 1.7729167938232422, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18562481552362442, "step": 7886 }, { "epoch": 0.15776, "grad_norm": 2.109375, "grad_norm_var": 0.006843058268229166, "learning_rate": 0.0001, "loss": 4.3897, "loss/crossentropy": 2.132485508918762, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23432063311338425, "step": 7888 }, { "epoch": 0.1578, "grad_norm": 2.328125, "grad_norm_var": 0.0091949462890625, "learning_rate": 0.0001, "loss": 4.3731, "loss/crossentropy": 2.122144937515259, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23042195290327072, "step": 7890 }, { "epoch": 0.15784, "grad_norm": 2.8125, "grad_norm_var": 0.034956868489583334, "learning_rate": 0.0001, "loss": 4.4025, "loss/crossentropy": 1.855578601360321, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20124691724777222, "step": 7892 }, { "epoch": 0.15788, "grad_norm": 2.15625, "grad_norm_var": 0.035054524739583336, "learning_rate": 0.0001, "loss": 4.172, "loss/crossentropy": 2.02128005027771, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20766886323690414, "step": 7894 }, { "epoch": 0.15792, "grad_norm": 1.984375, "grad_norm_var": 0.03681233723958333, "learning_rate": 0.0001, "loss": 4.3096, "loss/crossentropy": 2.112824857234955, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22250613570213318, "step": 7896 }, { "epoch": 0.15796, "grad_norm": 2.3125, "grad_norm_var": 0.03662821451822917, "learning_rate": 0.0001, "loss": 4.5595, "loss/crossentropy": 2.2290207147598267, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2531846910715103, "step": 7898 }, { "epoch": 0.158, "grad_norm": 1.96875, "grad_norm_var": 0.03882548014322917, "learning_rate": 0.0001, "loss": 4.3586, "loss/crossentropy": 2.135373592376709, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2266940325498581, "step": 7900 }, { "epoch": 0.15804, "grad_norm": 2.125, "grad_norm_var": 0.04052734375, "learning_rate": 0.0001, "loss": 4.1719, "loss/crossentropy": 1.7298616170883179, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1986929401755333, "step": 7902 }, { "epoch": 0.15808, "grad_norm": 2.3125, "grad_norm_var": 0.04173075358072917, "learning_rate": 0.0001, "loss": 4.403, "loss/crossentropy": 2.18759286403656, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21622422337532043, "step": 7904 }, { "epoch": 0.15812, "grad_norm": 2.140625, "grad_norm_var": 0.03889058430989583, "learning_rate": 0.0001, "loss": 4.4765, "loss/crossentropy": 2.0889216661453247, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21440055221319199, "step": 7906 }, { "epoch": 0.15816, "grad_norm": 2.1875, "grad_norm_var": 0.010204060872395834, "learning_rate": 0.0001, "loss": 4.2231, "loss/crossentropy": 1.7791658639907837, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1967781037092209, "step": 7908 }, { "epoch": 0.1582, "grad_norm": 1.953125, "grad_norm_var": 0.012386067708333334, "learning_rate": 0.0001, "loss": 4.2314, "loss/crossentropy": 2.2144845724105835, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22152486443519592, "step": 7910 }, { "epoch": 0.15824, "grad_norm": 2.078125, "grad_norm_var": 0.012723795572916667, "learning_rate": 0.0001, "loss": 4.6236, "loss/crossentropy": 2.316117286682129, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24549901485443115, "step": 7912 }, { "epoch": 0.15828, "grad_norm": 2.046875, "grad_norm_var": 0.0115875244140625, "learning_rate": 0.0001, "loss": 4.0867, "loss/crossentropy": 1.8642511367797852, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22441796958446503, "step": 7914 }, { "epoch": 0.15832, "grad_norm": 2.078125, "grad_norm_var": 0.0098541259765625, "learning_rate": 0.0001, "loss": 4.1365, "loss/crossentropy": 1.807969868183136, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22138798981904984, "step": 7916 }, { "epoch": 0.15836, "grad_norm": 2.390625, "grad_norm_var": 0.013866170247395834, "learning_rate": 0.0001, "loss": 4.5625, "loss/crossentropy": 2.266697645187378, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22544697672128677, "step": 7918 }, { "epoch": 0.1584, "grad_norm": 2.203125, "grad_norm_var": 0.012262980143229166, "learning_rate": 0.0001, "loss": 4.5632, "loss/crossentropy": 2.0871587991714478, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21997228264808655, "step": 7920 }, { "epoch": 0.15844, "grad_norm": 2.21875, "grad_norm_var": 0.0126861572265625, "learning_rate": 0.0001, "loss": 4.5567, "loss/crossentropy": 2.1993759870529175, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2416691780090332, "step": 7922 }, { "epoch": 0.15848, "grad_norm": 2.5, "grad_norm_var": 0.020340983072916666, "learning_rate": 0.0001, "loss": 4.2997, "loss/crossentropy": 1.8039653897285461, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21721573173999786, "step": 7924 }, { "epoch": 0.15852, "grad_norm": 2.3125, "grad_norm_var": 0.0225982666015625, "learning_rate": 0.0001, "loss": 4.4795, "loss/crossentropy": 2.188117265701294, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.231883242726326, "step": 7926 }, { "epoch": 0.15856, "grad_norm": 2.171875, "grad_norm_var": 0.021923828125, "learning_rate": 0.0001, "loss": 4.6694, "loss/crossentropy": 2.3920425176620483, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24655026197433472, "step": 7928 }, { "epoch": 0.1586, "grad_norm": 2.125, "grad_norm_var": 0.017561848958333334, "learning_rate": 0.0001, "loss": 4.1001, "loss/crossentropy": 2.286831498146057, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23398208618164062, "step": 7930 }, { "epoch": 0.15864, "grad_norm": 2.0625, "grad_norm_var": 0.016825358072916668, "learning_rate": 0.0001, "loss": 4.0007, "loss/crossentropy": 2.075824797153473, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22423933446407318, "step": 7932 }, { "epoch": 0.15868, "grad_norm": 2.25, "grad_norm_var": 0.0155426025390625, "learning_rate": 0.0001, "loss": 4.5697, "loss/crossentropy": 2.197165012359619, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22215355187654495, "step": 7934 }, { "epoch": 0.15872, "grad_norm": 2.078125, "grad_norm_var": 0.017854817708333335, "learning_rate": 0.0001, "loss": 4.2899, "loss/crossentropy": 1.8253535032272339, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20457974076271057, "step": 7936 }, { "epoch": 0.15876, "grad_norm": 2.21875, "grad_norm_var": 0.01783447265625, "learning_rate": 0.0001, "loss": 4.4774, "loss/crossentropy": 1.842383086681366, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20742817968130112, "step": 7938 }, { "epoch": 0.1588, "grad_norm": 2.0625, "grad_norm_var": 0.014208984375, "learning_rate": 0.0001, "loss": 4.3801, "loss/crossentropy": 2.2086315155029297, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2086438685655594, "step": 7940 }, { "epoch": 0.15884, "grad_norm": 2.109375, "grad_norm_var": 0.0067047119140625, "learning_rate": 0.0001, "loss": 4.3718, "loss/crossentropy": 2.3381282091140747, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2546956539154053, "step": 7942 }, { "epoch": 0.15888, "grad_norm": 2.265625, "grad_norm_var": 0.005248006184895833, "learning_rate": 0.0001, "loss": 4.3439, "loss/crossentropy": 2.1159931421279907, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23917824029922485, "step": 7944 }, { "epoch": 0.15892, "grad_norm": 2.078125, "grad_norm_var": 0.0069081624348958336, "learning_rate": 0.0001, "loss": 4.2056, "loss/crossentropy": 1.7507159113883972, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1870383694767952, "step": 7946 }, { "epoch": 0.15896, "grad_norm": 2.140625, "grad_norm_var": 0.0073964436848958336, "learning_rate": 0.0001, "loss": 4.2118, "loss/crossentropy": 2.213807225227356, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22211921215057373, "step": 7948 }, { "epoch": 0.159, "grad_norm": 2.03125, "grad_norm_var": 0.009598795572916667, "learning_rate": 0.0001, "loss": 4.4316, "loss/crossentropy": 2.0570366978645325, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2662791311740875, "step": 7950 }, { "epoch": 0.15904, "grad_norm": 2.171875, "grad_norm_var": 0.009748331705729167, "learning_rate": 0.0001, "loss": 4.2041, "loss/crossentropy": 1.8775206208229065, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22045740485191345, "step": 7952 }, { "epoch": 0.15908, "grad_norm": 2.046875, "grad_norm_var": 0.009601847330729166, "learning_rate": 0.0001, "loss": 4.2575, "loss/crossentropy": 2.3483108282089233, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22626785188913345, "step": 7954 }, { "epoch": 0.15912, "grad_norm": 2.125, "grad_norm_var": 0.009496053059895834, "learning_rate": 0.0001, "loss": 4.193, "loss/crossentropy": 1.9531084895133972, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22194421291351318, "step": 7956 }, { "epoch": 0.15916, "grad_norm": 2.328125, "grad_norm_var": 0.011213175455729167, "learning_rate": 0.0001, "loss": 4.4819, "loss/crossentropy": 2.057813823223114, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21116723865270615, "step": 7958 }, { "epoch": 0.1592, "grad_norm": 2.28125, "grad_norm_var": 0.01148681640625, "learning_rate": 0.0001, "loss": 4.3478, "loss/crossentropy": 1.9398415088653564, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2257898524403572, "step": 7960 }, { "epoch": 0.15924, "grad_norm": 2.015625, "grad_norm_var": 0.0111480712890625, "learning_rate": 0.0001, "loss": 4.1942, "loss/crossentropy": 1.9200270175933838, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2119666188955307, "step": 7962 }, { "epoch": 0.15928, "grad_norm": 2.203125, "grad_norm_var": 0.010640462239583334, "learning_rate": 0.0001, "loss": 4.4513, "loss/crossentropy": 2.157149076461792, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23271384835243225, "step": 7964 }, { "epoch": 0.15932, "grad_norm": 2.25, "grad_norm_var": 0.0083404541015625, "learning_rate": 0.0001, "loss": 4.2064, "loss/crossentropy": 2.0308582186698914, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23934553563594818, "step": 7966 }, { "epoch": 0.15936, "grad_norm": 2.234375, "grad_norm_var": 0.015543619791666666, "learning_rate": 0.0001, "loss": 4.3977, "loss/crossentropy": 2.1855711936950684, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.236568883061409, "step": 7968 }, { "epoch": 0.1594, "grad_norm": 2.375, "grad_norm_var": 0.07629292805989583, "learning_rate": 0.0001, "loss": 4.719, "loss/crossentropy": 2.4479551315307617, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24693088978528976, "step": 7970 }, { "epoch": 0.15944, "grad_norm": 2.21875, "grad_norm_var": 0.07333882649739583, "learning_rate": 0.0001, "loss": 4.5221, "loss/crossentropy": 2.439974784851074, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2502682954072952, "step": 7972 }, { "epoch": 0.15948, "grad_norm": 2.375, "grad_norm_var": 0.07625325520833333, "learning_rate": 0.0001, "loss": 4.4024, "loss/crossentropy": 1.9899010062217712, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23031124472618103, "step": 7974 }, { "epoch": 0.15952, "grad_norm": 2.109375, "grad_norm_var": 0.07517903645833333, "learning_rate": 0.0001, "loss": 4.2522, "loss/crossentropy": 1.830255150794983, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1984459012746811, "step": 7976 }, { "epoch": 0.15956, "grad_norm": 2.21875, "grad_norm_var": 0.07088114420572916, "learning_rate": 0.0001, "loss": 4.3832, "loss/crossentropy": 1.9675705432891846, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21336429566144943, "step": 7978 }, { "epoch": 0.1596, "grad_norm": 2.1875, "grad_norm_var": 0.06965738932291667, "learning_rate": 0.0001, "loss": 4.4784, "loss/crossentropy": 2.030815005302429, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2102503478527069, "step": 7980 }, { "epoch": 0.15964, "grad_norm": 2.109375, "grad_norm_var": 0.07285054524739583, "learning_rate": 0.0001, "loss": 4.492, "loss/crossentropy": 2.4932440519332886, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.246867336332798, "step": 7982 }, { "epoch": 0.15968, "grad_norm": 2.078125, "grad_norm_var": 0.07330322265625, "learning_rate": 0.0001, "loss": 4.2085, "loss/crossentropy": 1.5839802622795105, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17231802642345428, "step": 7984 }, { "epoch": 0.15972, "grad_norm": 2.359375, "grad_norm_var": 0.0091217041015625, "learning_rate": 0.0001, "loss": 4.4045, "loss/crossentropy": 1.821477472782135, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2239791452884674, "step": 7986 }, { "epoch": 0.15976, "grad_norm": 2.109375, "grad_norm_var": 0.0093414306640625, "learning_rate": 0.0001, "loss": 4.0655, "loss/crossentropy": 2.013838052749634, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2335103377699852, "step": 7988 }, { "epoch": 0.1598, "grad_norm": 2.03125, "grad_norm_var": 0.006376139322916667, "learning_rate": 0.0001, "loss": 4.4989, "loss/crossentropy": 1.9412779211997986, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20336110144853592, "step": 7990 }, { "epoch": 0.15984, "grad_norm": 2.0, "grad_norm_var": 0.008234659830729166, "learning_rate": 0.0001, "loss": 3.9015, "loss/crossentropy": 1.6653677225112915, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2025909200310707, "step": 7992 }, { "epoch": 0.15988, "grad_norm": 2.125, "grad_norm_var": 0.00865478515625, "learning_rate": 0.0001, "loss": 4.4299, "loss/crossentropy": 2.0069726705551147, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21300900727510452, "step": 7994 }, { "epoch": 0.15992, "grad_norm": 2.046875, "grad_norm_var": 0.009919230143229167, "learning_rate": 0.0001, "loss": 4.4395, "loss/crossentropy": 2.1118472814559937, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25853876769542694, "step": 7996 }, { "epoch": 0.15996, "grad_norm": 2.34375, "grad_norm_var": 0.013622029622395834, "learning_rate": 0.0001, "loss": 4.1922, "loss/crossentropy": 1.608262836933136, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1958206593990326, "step": 7998 }, { "epoch": 0.16, "grad_norm": 2.203125, "grad_norm_var": 0.0138580322265625, "learning_rate": 0.0001, "loss": 4.2896, "loss/crossentropy": 1.6572073101997375, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19102878868579865, "step": 8000 }, { "epoch": 0.16004, "grad_norm": 2.015625, "grad_norm_var": 0.0113677978515625, "learning_rate": 0.0001, "loss": 4.0811, "loss/crossentropy": 1.9421688318252563, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19628942012786865, "step": 8002 }, { "epoch": 0.16008, "grad_norm": 2.265625, "grad_norm_var": 0.0134918212890625, "learning_rate": 0.0001, "loss": 4.5344, "loss/crossentropy": 2.2197489738464355, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.248014435172081, "step": 8004 }, { "epoch": 0.16012, "grad_norm": 2.203125, "grad_norm_var": 0.0126617431640625, "learning_rate": 0.0001, "loss": 4.4625, "loss/crossentropy": 2.200868308544159, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2476908192038536, "step": 8006 }, { "epoch": 0.16016, "grad_norm": 2.34375, "grad_norm_var": 0.01162109375, "learning_rate": 0.0001, "loss": 4.8151, "loss/crossentropy": 2.3793649673461914, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2356039509177208, "step": 8008 }, { "epoch": 0.1602, "grad_norm": 2.140625, "grad_norm_var": 0.0112945556640625, "learning_rate": 0.0001, "loss": 4.1849, "loss/crossentropy": 2.130257308483124, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.224358968436718, "step": 8010 }, { "epoch": 0.16024, "grad_norm": 2.15625, "grad_norm_var": 0.009566243489583333, "learning_rate": 0.0001, "loss": 4.5948, "loss/crossentropy": 2.370365023612976, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23811528086662292, "step": 8012 }, { "epoch": 0.16028, "grad_norm": 2.0, "grad_norm_var": 0.008812459309895833, "learning_rate": 0.0001, "loss": 4.4326, "loss/crossentropy": 1.985486626625061, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19639353454113007, "step": 8014 }, { "epoch": 0.16032, "grad_norm": 2.40625, "grad_norm_var": 0.01129150390625, "learning_rate": 0.0001, "loss": 4.4335, "loss/crossentropy": 2.2128632068634033, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2340361252427101, "step": 8016 }, { "epoch": 0.16036, "grad_norm": 2.0625, "grad_norm_var": 0.009956868489583333, "learning_rate": 0.0001, "loss": 4.3688, "loss/crossentropy": 1.830498456954956, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19760622829198837, "step": 8018 }, { "epoch": 0.1604, "grad_norm": 4.375, "grad_norm_var": 0.3061757405598958, "learning_rate": 0.0001, "loss": 4.6443, "loss/crossentropy": 1.9595977067947388, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2138657420873642, "step": 8020 }, { "epoch": 0.16044, "grad_norm": 2.1875, "grad_norm_var": 0.30684305826822916, "learning_rate": 0.0001, "loss": 4.3101, "loss/crossentropy": 2.400893449783325, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2615668326616287, "step": 8022 }, { "epoch": 0.16048, "grad_norm": 2.140625, "grad_norm_var": 0.3080963134765625, "learning_rate": 0.0001, "loss": 4.6359, "loss/crossentropy": 2.5079843997955322, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.251235693693161, "step": 8024 }, { "epoch": 0.16052, "grad_norm": 2.265625, "grad_norm_var": 0.30686848958333335, "learning_rate": 0.0001, "loss": 4.4973, "loss/crossentropy": 2.573891043663025, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2646481841802597, "step": 8026 }, { "epoch": 0.16056, "grad_norm": 2.1875, "grad_norm_var": 0.3064605712890625, "learning_rate": 0.0001, "loss": 4.4177, "loss/crossentropy": 2.0088363885879517, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22888045758008957, "step": 8028 }, { "epoch": 0.1606, "grad_norm": 2.109375, "grad_norm_var": 0.31115697224934896, "learning_rate": 0.0001, "loss": 4.214, "loss/crossentropy": 2.4596647024154663, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2507014721632004, "step": 8030 }, { "epoch": 0.16064, "grad_norm": 2.1875, "grad_norm_var": 0.3093462626139323, "learning_rate": 0.0001, "loss": 4.6826, "loss/crossentropy": 2.204255223274231, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22913406044244766, "step": 8032 }, { "epoch": 0.16068, "grad_norm": 2.125, "grad_norm_var": 0.30677261352539065, "learning_rate": 0.0001, "loss": 4.6603, "loss/crossentropy": 2.285408139228821, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23185817897319794, "step": 8034 }, { "epoch": 0.16072, "grad_norm": 1.9609375, "grad_norm_var": 0.012520345052083333, "learning_rate": 0.0001, "loss": 3.9513, "loss/crossentropy": 2.0788660645484924, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20811481028795242, "step": 8036 }, { "epoch": 0.16076, "grad_norm": 2.140625, "grad_norm_var": 0.01307373046875, "learning_rate": 0.0001, "loss": 4.5368, "loss/crossentropy": 2.398258686065674, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2411409318447113, "step": 8038 }, { "epoch": 0.1608, "grad_norm": 2.140625, "grad_norm_var": 0.015579986572265624, "learning_rate": 0.0001, "loss": 4.0415, "loss/crossentropy": 2.3101454973220825, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22493668645620346, "step": 8040 }, { "epoch": 0.16084, "grad_norm": 2.203125, "grad_norm_var": 0.015134429931640625, "learning_rate": 0.0001, "loss": 4.0612, "loss/crossentropy": 1.9015105962753296, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22360052913427353, "step": 8042 }, { "epoch": 0.16088, "grad_norm": 2.34375, "grad_norm_var": 0.017704010009765625, "learning_rate": 0.0001, "loss": 4.216, "loss/crossentropy": 2.0112481117248535, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24062782526016235, "step": 8044 }, { "epoch": 0.16092, "grad_norm": 2.171875, "grad_norm_var": 0.0161773681640625, "learning_rate": 0.0001, "loss": 4.3638, "loss/crossentropy": 2.2024285793304443, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21959003806114197, "step": 8046 }, { "epoch": 0.16096, "grad_norm": 2.125, "grad_norm_var": 0.014412434895833333, "learning_rate": 0.0001, "loss": 4.4825, "loss/crossentropy": 2.1069058775901794, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21980682760477066, "step": 8048 }, { "epoch": 0.161, "grad_norm": 2.3125, "grad_norm_var": 0.012540690104166667, "learning_rate": 0.0001, "loss": 4.2106, "loss/crossentropy": 1.8380340337753296, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2201671525835991, "step": 8050 }, { "epoch": 0.16104, "grad_norm": 2.046875, "grad_norm_var": 0.010965728759765625, "learning_rate": 0.0001, "loss": 4.3778, "loss/crossentropy": 2.276741087436676, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23787930607795715, "step": 8052 }, { "epoch": 0.16108, "grad_norm": 2.140625, "grad_norm_var": 0.010680898030598959, "learning_rate": 0.0001, "loss": 4.0784, "loss/crossentropy": 1.631809651851654, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19654212146997452, "step": 8054 }, { "epoch": 0.16112, "grad_norm": 2.234375, "grad_norm_var": 0.007453409830729166, "learning_rate": 0.0001, "loss": 4.5927, "loss/crossentropy": 2.067444145679474, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21420229971408844, "step": 8056 }, { "epoch": 0.16116, "grad_norm": 2.21875, "grad_norm_var": 0.011767578125, "learning_rate": 0.0001, "loss": 4.562, "loss/crossentropy": 2.384890556335449, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2312106341123581, "step": 8058 }, { "epoch": 0.1612, "grad_norm": 2.265625, "grad_norm_var": 0.010986328125, "learning_rate": 0.0001, "loss": 4.363, "loss/crossentropy": 2.3683160543441772, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2527216002345085, "step": 8060 }, { "epoch": 0.16124, "grad_norm": 2.171875, "grad_norm_var": 0.011253865559895833, "learning_rate": 0.0001, "loss": 4.4576, "loss/crossentropy": 2.1545952558517456, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23928315192461014, "step": 8062 }, { "epoch": 0.16128, "grad_norm": 1.96875, "grad_norm_var": 0.013695271809895833, "learning_rate": 0.0001, "loss": 4.2285, "loss/crossentropy": 2.0792208313941956, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2226722463965416, "step": 8064 }, { "epoch": 0.16132, "grad_norm": 2.09375, "grad_norm_var": 0.013068644205729167, "learning_rate": 0.0001, "loss": 3.8788, "loss/crossentropy": 2.181519627571106, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2193506434559822, "step": 8066 }, { "epoch": 0.16136, "grad_norm": 2.375, "grad_norm_var": 0.015067545572916667, "learning_rate": 0.0001, "loss": 4.4445, "loss/crossentropy": 1.7798657417297363, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23413337767124176, "step": 8068 }, { "epoch": 0.1614, "grad_norm": 2.125, "grad_norm_var": 0.017235310872395833, "learning_rate": 0.0001, "loss": 4.0731, "loss/crossentropy": 2.1232666969299316, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23351240158081055, "step": 8070 }, { "epoch": 0.16144, "grad_norm": 2.203125, "grad_norm_var": 0.017609659830729166, "learning_rate": 0.0001, "loss": 4.6975, "loss/crossentropy": 2.34002685546875, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2533388137817383, "step": 8072 }, { "epoch": 0.16148, "grad_norm": 2.21875, "grad_norm_var": 0.011637369791666666, "learning_rate": 0.0001, "loss": 4.3487, "loss/crossentropy": 2.066399872303009, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21156150847673416, "step": 8074 }, { "epoch": 0.16152, "grad_norm": 2.171875, "grad_norm_var": 0.010677083333333334, "learning_rate": 0.0001, "loss": 4.3646, "loss/crossentropy": 2.2298463582992554, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2307809516787529, "step": 8076 }, { "epoch": 0.16156, "grad_norm": 2.0625, "grad_norm_var": 0.0105865478515625, "learning_rate": 0.0001, "loss": 4.1946, "loss/crossentropy": 1.9858508110046387, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21499747782945633, "step": 8078 }, { "epoch": 0.1616, "grad_norm": 2.0625, "grad_norm_var": 0.011149088541666666, "learning_rate": 0.0001, "loss": 3.9984, "loss/crossentropy": 1.5669215321540833, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1889330968260765, "step": 8080 }, { "epoch": 0.16164, "grad_norm": 2.25, "grad_norm_var": 0.011881510416666666, "learning_rate": 0.0001, "loss": 4.4743, "loss/crossentropy": 2.296878218650818, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2405889928340912, "step": 8082 }, { "epoch": 0.16168, "grad_norm": 2.203125, "grad_norm_var": 0.007307942708333333, "learning_rate": 0.0001, "loss": 4.2205, "loss/crossentropy": 1.954626441001892, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19809379428625107, "step": 8084 }, { "epoch": 0.16172, "grad_norm": 2.3125, "grad_norm_var": 0.008687337239583334, "learning_rate": 0.0001, "loss": 4.5923, "loss/crossentropy": 1.8640353083610535, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.226090669631958, "step": 8086 }, { "epoch": 0.16176, "grad_norm": 2.28125, "grad_norm_var": 0.009663899739583334, "learning_rate": 0.0001, "loss": 4.3363, "loss/crossentropy": 2.1914591789245605, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24432511627674103, "step": 8088 }, { "epoch": 0.1618, "grad_norm": 2.21875, "grad_norm_var": 0.009798177083333333, "learning_rate": 0.0001, "loss": 4.7291, "loss/crossentropy": 2.192594051361084, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23682628571987152, "step": 8090 }, { "epoch": 0.16184, "grad_norm": 2.09375, "grad_norm_var": 0.010993448893229167, "learning_rate": 0.0001, "loss": 4.4592, "loss/crossentropy": 2.210235595703125, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.230379119515419, "step": 8092 }, { "epoch": 0.16188, "grad_norm": 2.140625, "grad_norm_var": 0.011115519205729167, "learning_rate": 0.0001, "loss": 4.4111, "loss/crossentropy": 2.214667320251465, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26643867790699005, "step": 8094 }, { "epoch": 0.16192, "grad_norm": 2.09375, "grad_norm_var": 0.009586588541666666, "learning_rate": 0.0001, "loss": 4.3131, "loss/crossentropy": 1.9808599948883057, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20679324120283127, "step": 8096 }, { "epoch": 0.16196, "grad_norm": 2.109375, "grad_norm_var": 0.008333333333333333, "learning_rate": 0.0001, "loss": 4.4683, "loss/crossentropy": 2.076589345932007, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23430196940898895, "step": 8098 }, { "epoch": 0.162, "grad_norm": 2.15625, "grad_norm_var": 0.006883748372395833, "learning_rate": 0.0001, "loss": 4.5645, "loss/crossentropy": 2.364492177963257, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23023054748773575, "step": 8100 }, { "epoch": 0.16204, "grad_norm": 2.140625, "grad_norm_var": 0.005890909830729167, "learning_rate": 0.0001, "loss": 4.6595, "loss/crossentropy": 2.2908111214637756, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21239468455314636, "step": 8102 }, { "epoch": 0.16208, "grad_norm": 2.40625, "grad_norm_var": 0.008958943684895833, "learning_rate": 0.0001, "loss": 4.5503, "loss/crossentropy": 1.8306183218955994, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21511316299438477, "step": 8104 }, { "epoch": 0.16212, "grad_norm": 2.0625, "grad_norm_var": 0.010724894205729167, "learning_rate": 0.0001, "loss": 4.1582, "loss/crossentropy": 2.1121758222579956, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23471853882074356, "step": 8106 }, { "epoch": 0.16216, "grad_norm": 2.140625, "grad_norm_var": 0.012355295817057292, "learning_rate": 0.0001, "loss": 4.1464, "loss/crossentropy": 1.7613067030906677, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2025228664278984, "step": 8108 }, { "epoch": 0.1622, "grad_norm": 2.0, "grad_norm_var": 0.013844553629557292, "learning_rate": 0.0001, "loss": 4.0797, "loss/crossentropy": 2.1413058042526245, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21817607432603836, "step": 8110 }, { "epoch": 0.16224, "grad_norm": 2.265625, "grad_norm_var": 0.013641103108723959, "learning_rate": 0.0001, "loss": 4.5739, "loss/crossentropy": 2.375948429107666, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.30982857942581177, "step": 8112 }, { "epoch": 0.16228, "grad_norm": 2.1875, "grad_norm_var": 0.013396962483723959, "learning_rate": 0.0001, "loss": 4.076, "loss/crossentropy": 1.9669193029403687, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21873307973146439, "step": 8114 }, { "epoch": 0.16232, "grad_norm": 2.21875, "grad_norm_var": 0.015592193603515625, "learning_rate": 0.0001, "loss": 4.4024, "loss/crossentropy": 2.2028547525405884, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.229970782995224, "step": 8116 }, { "epoch": 0.16236, "grad_norm": 2.078125, "grad_norm_var": 0.01622289021809896, "learning_rate": 0.0001, "loss": 4.3359, "loss/crossentropy": 2.082156002521515, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2262335941195488, "step": 8118 }, { "epoch": 0.1624, "grad_norm": 2.078125, "grad_norm_var": 0.012379709879557292, "learning_rate": 0.0001, "loss": 4.2737, "loss/crossentropy": 2.0538666248321533, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22941745817661285, "step": 8120 }, { "epoch": 0.16244, "grad_norm": 2.4375, "grad_norm_var": 0.015750885009765625, "learning_rate": 0.0001, "loss": 4.3615, "loss/crossentropy": 2.338989734649658, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2434876710176468, "step": 8122 }, { "epoch": 0.16248, "grad_norm": 2.375, "grad_norm_var": 2.602311197916667, "learning_rate": 0.0001, "loss": 4.5472, "loss/crossentropy": 2.277916193008423, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.29285988211631775, "step": 8124 }, { "epoch": 0.16252, "grad_norm": 2.46875, "grad_norm_var": 2.567577107747396, "learning_rate": 0.0001, "loss": 4.3437, "loss/crossentropy": 2.1196334958076477, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20728030800819397, "step": 8126 }, { "epoch": 0.16256, "grad_norm": 2.15625, "grad_norm_var": 2.5655558268229166, "learning_rate": 0.0001, "loss": 4.4216, "loss/crossentropy": 2.0216450095176697, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.229092076420784, "step": 8128 }, { "epoch": 0.1626, "grad_norm": 2.171875, "grad_norm_var": 2.5546834309895834, "learning_rate": 0.0001, "loss": 4.5163, "loss/crossentropy": 2.098921537399292, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22539281845092773, "step": 8130 }, { "epoch": 0.16264, "grad_norm": 2.03125, "grad_norm_var": 2.580052693684896, "learning_rate": 0.0001, "loss": 4.1066, "loss/crossentropy": 2.0186068415641785, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20286529511213303, "step": 8132 }, { "epoch": 0.16268, "grad_norm": 2.28125, "grad_norm_var": 2.5584706624348956, "learning_rate": 0.0001, "loss": 4.7847, "loss/crossentropy": 2.157357335090637, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23342902958393097, "step": 8134 }, { "epoch": 0.16272, "grad_norm": 2.140625, "grad_norm_var": 2.5516998291015627, "learning_rate": 0.0001, "loss": 4.3489, "loss/crossentropy": 2.1885476112365723, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22314801812171936, "step": 8136 }, { "epoch": 0.16276, "grad_norm": 2.21875, "grad_norm_var": 2.5796160380045574, "learning_rate": 0.0001, "loss": 4.3403, "loss/crossentropy": 2.2208757400512695, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2295266091823578, "step": 8138 }, { "epoch": 0.1628, "grad_norm": 2.015625, "grad_norm_var": 0.027522532145182292, "learning_rate": 0.0001, "loss": 4.0473, "loss/crossentropy": 1.7885233163833618, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2008970081806183, "step": 8140 }, { "epoch": 0.16284, "grad_norm": 2.078125, "grad_norm_var": 0.022989654541015626, "learning_rate": 0.0001, "loss": 4.0835, "loss/crossentropy": 2.152435064315796, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2311881259083748, "step": 8142 }, { "epoch": 0.16288, "grad_norm": 2.296875, "grad_norm_var": 0.016001129150390626, "learning_rate": 0.0001, "loss": 4.3384, "loss/crossentropy": 2.0818406343460083, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24117180705070496, "step": 8144 }, { "epoch": 0.16292, "grad_norm": 2.140625, "grad_norm_var": 0.02513402303059896, "learning_rate": 0.0001, "loss": 4.4246, "loss/crossentropy": 2.1775856614112854, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23246955126523972, "step": 8146 }, { "epoch": 0.16296, "grad_norm": 2.578125, "grad_norm_var": 0.033607737223307295, "learning_rate": 0.0001, "loss": 3.9993, "loss/crossentropy": 1.7029682397842407, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19564303010702133, "step": 8148 }, { "epoch": 0.163, "grad_norm": 2.453125, "grad_norm_var": 0.03468195597330729, "learning_rate": 0.0001, "loss": 4.5631, "loss/crossentropy": 2.070194900035858, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2382609099149704, "step": 8150 }, { "epoch": 0.16304, "grad_norm": 2.046875, "grad_norm_var": 0.036043039957682294, "learning_rate": 0.0001, "loss": 4.19, "loss/crossentropy": 1.9214876890182495, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20894134789705276, "step": 8152 }, { "epoch": 0.16308, "grad_norm": 2.21875, "grad_norm_var": 0.03243815104166667, "learning_rate": 0.0001, "loss": 4.3622, "loss/crossentropy": 1.7311474084854126, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1962232068181038, "step": 8154 }, { "epoch": 0.16312, "grad_norm": 2.0, "grad_norm_var": 0.0323150634765625, "learning_rate": 0.0001, "loss": 4.4623, "loss/crossentropy": 2.2161877155303955, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22318138182163239, "step": 8156 }, { "epoch": 0.16316, "grad_norm": 2.078125, "grad_norm_var": 0.03186747233072917, "learning_rate": 0.0001, "loss": 4.7343, "loss/crossentropy": 2.21865177154541, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.234280064702034, "step": 8158 }, { "epoch": 0.1632, "grad_norm": 2.09375, "grad_norm_var": 0.032835896809895834, "learning_rate": 0.0001, "loss": 4.3277, "loss/crossentropy": 1.9517142176628113, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1964063122868538, "step": 8160 }, { "epoch": 0.16324, "grad_norm": 2.25, "grad_norm_var": 0.023713175455729166, "learning_rate": 0.0001, "loss": 4.3878, "loss/crossentropy": 2.261234760284424, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24549759924411774, "step": 8162 }, { "epoch": 0.16328, "grad_norm": 2.109375, "grad_norm_var": 0.012751261393229166, "learning_rate": 0.0001, "loss": 4.159, "loss/crossentropy": 1.9791623950004578, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2018037736415863, "step": 8164 }, { "epoch": 0.16332, "grad_norm": 2.1875, "grad_norm_var": 0.005125935872395833, "learning_rate": 0.0001, "loss": 4.6231, "loss/crossentropy": 2.3916029930114746, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24446460604667664, "step": 8166 }, { "epoch": 0.16336, "grad_norm": 2.328125, "grad_norm_var": 0.007991536458333334, "learning_rate": 0.0001, "loss": 4.3251, "loss/crossentropy": 2.204437553882599, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22334939241409302, "step": 8168 }, { "epoch": 0.1634, "grad_norm": 2.0625, "grad_norm_var": 0.007111612955729167, "learning_rate": 0.0001, "loss": 4.3236, "loss/crossentropy": 2.2013272047042847, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2068365067243576, "step": 8170 }, { "epoch": 0.16344, "grad_norm": 2.03125, "grad_norm_var": 0.0065826416015625, "learning_rate": 0.0001, "loss": 4.2597, "loss/crossentropy": 1.8648701310157776, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19690127670764923, "step": 8172 }, { "epoch": 0.16348, "grad_norm": 1.9453125, "grad_norm_var": 0.008857981363932291, "learning_rate": 0.0001, "loss": 4.1324, "loss/crossentropy": 2.221195936203003, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1900760903954506, "step": 8174 }, { "epoch": 0.16352, "grad_norm": 2.203125, "grad_norm_var": 0.009445953369140624, "learning_rate": 0.0001, "loss": 4.4874, "loss/crossentropy": 1.8648499846458435, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1982845515012741, "step": 8176 }, { "epoch": 0.16356, "grad_norm": 2.125, "grad_norm_var": 0.010762278238932292, "learning_rate": 0.0001, "loss": 4.588, "loss/crossentropy": 2.3032894134521484, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25082532316446304, "step": 8178 }, { "epoch": 0.1636, "grad_norm": 2.203125, "grad_norm_var": 0.010931142171223958, "learning_rate": 0.0001, "loss": 4.3375, "loss/crossentropy": 2.0634626150131226, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23546822369098663, "step": 8180 }, { "epoch": 0.16364, "grad_norm": 2.28125, "grad_norm_var": 0.012737782796223958, "learning_rate": 0.0001, "loss": 4.2402, "loss/crossentropy": 1.7406468391418457, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20627902448177338, "step": 8182 }, { "epoch": 0.16368, "grad_norm": 2.125, "grad_norm_var": 0.012668609619140625, "learning_rate": 0.0001, "loss": 4.4103, "loss/crossentropy": 2.3812272548675537, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25258868932724, "step": 8184 }, { "epoch": 0.16372, "grad_norm": 2.1875, "grad_norm_var": 0.012499745686848958, "learning_rate": 0.0001, "loss": 4.3584, "loss/crossentropy": 2.20754611492157, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21015550196170807, "step": 8186 }, { "epoch": 0.16376, "grad_norm": 1.9921875, "grad_norm_var": 0.0140777587890625, "learning_rate": 0.0001, "loss": 4.1902, "loss/crossentropy": 2.081672966480255, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19573034346103668, "step": 8188 }, { "epoch": 0.1638, "grad_norm": 2.140625, "grad_norm_var": 0.009905751546223958, "learning_rate": 0.0001, "loss": 4.5267, "loss/crossentropy": 2.184974491596222, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2477174997329712, "step": 8190 }, { "epoch": 0.16384, "grad_norm": 2.125, "grad_norm_var": 0.010322825113932291, "learning_rate": 0.0001, "loss": 4.3804, "loss/crossentropy": 2.1048192977905273, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22900478541851044, "step": 8192 }, { "epoch": 0.16388, "grad_norm": 2.046875, "grad_norm_var": 0.010135650634765625, "learning_rate": 0.0001, "loss": 4.1068, "loss/crossentropy": 1.960956335067749, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23504969477653503, "step": 8194 }, { "epoch": 0.16392, "grad_norm": 2.484375, "grad_norm_var": 0.017114003499348957, "learning_rate": 0.0001, "loss": 4.7698, "loss/crossentropy": 2.158856213092804, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2200375646352768, "step": 8196 }, { "epoch": 0.16396, "grad_norm": 2.015625, "grad_norm_var": 0.017286936442057293, "learning_rate": 0.0001, "loss": 4.0533, "loss/crossentropy": 2.0954058170318604, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22729048877954483, "step": 8198 }, { "epoch": 0.164, "grad_norm": 2.078125, "grad_norm_var": 0.01599299112955729, "learning_rate": 0.0001, "loss": 4.3492, "loss/crossentropy": 2.1452964544296265, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21412881463766098, "step": 8200 }, { "epoch": 0.16404, "grad_norm": 1.8359375, "grad_norm_var": 0.02072321573893229, "learning_rate": 0.0001, "loss": 4.0204, "loss/crossentropy": 1.9737866520881653, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19449186325073242, "step": 8202 }, { "epoch": 0.16408, "grad_norm": 2.109375, "grad_norm_var": 0.020344034830729166, "learning_rate": 0.0001, "loss": 4.5119, "loss/crossentropy": 2.213072657585144, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2333887815475464, "step": 8204 }, { "epoch": 0.16412, "grad_norm": 2.09375, "grad_norm_var": 0.0203277587890625, "learning_rate": 0.0001, "loss": 4.2139, "loss/crossentropy": 1.8702161312103271, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21368274092674255, "step": 8206 }, { "epoch": 0.16416, "grad_norm": 1.9765625, "grad_norm_var": 0.02102635701497396, "learning_rate": 0.0001, "loss": 4.242, "loss/crossentropy": 2.177275776863098, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21248809248209, "step": 8208 }, { "epoch": 0.1642, "grad_norm": 1.9375, "grad_norm_var": 0.021945953369140625, "learning_rate": 0.0001, "loss": 4.3151, "loss/crossentropy": 2.3422038555145264, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2288341298699379, "step": 8210 }, { "epoch": 0.16424, "grad_norm": 2.265625, "grad_norm_var": 0.014212799072265626, "learning_rate": 0.0001, "loss": 4.2195, "loss/crossentropy": 2.094432234764099, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2224871665239334, "step": 8212 }, { "epoch": 0.16428, "grad_norm": 2.125, "grad_norm_var": 0.020072174072265626, "learning_rate": 0.0001, "loss": 4.5045, "loss/crossentropy": 2.3371682167053223, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2710718363523483, "step": 8214 }, { "epoch": 0.16432, "grad_norm": 2.25, "grad_norm_var": 0.034242502848307294, "learning_rate": 0.0001, "loss": 4.6926, "loss/crossentropy": 1.8700988292694092, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20795634388923645, "step": 8216 }, { "epoch": 0.16436, "grad_norm": 2.046875, "grad_norm_var": 0.02575658162434896, "learning_rate": 0.0001, "loss": 4.3266, "loss/crossentropy": 2.080985188484192, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23497651517391205, "step": 8218 }, { "epoch": 0.1644, "grad_norm": 2.046875, "grad_norm_var": 0.026364898681640624, "learning_rate": 0.0001, "loss": 4.3873, "loss/crossentropy": 2.3486984968185425, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25393396615982056, "step": 8220 }, { "epoch": 0.16444, "grad_norm": 2.15625, "grad_norm_var": 0.028148396809895834, "learning_rate": 0.0001, "loss": 4.3446, "loss/crossentropy": 2.0084245800971985, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21738936007022858, "step": 8222 }, { "epoch": 0.16448, "grad_norm": 2.03125, "grad_norm_var": 0.02846247355143229, "learning_rate": 0.0001, "loss": 3.8584, "loss/crossentropy": 1.536482572555542, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17787320166826248, "step": 8224 }, { "epoch": 0.16452, "grad_norm": 2.1875, "grad_norm_var": 0.024857330322265624, "learning_rate": 0.0001, "loss": 4.494, "loss/crossentropy": 2.1727080941200256, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22967493534088135, "step": 8226 }, { "epoch": 0.16456, "grad_norm": 2.015625, "grad_norm_var": 0.02490208943684896, "learning_rate": 0.0001, "loss": 4.1895, "loss/crossentropy": 1.6915069222450256, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19113170355558395, "step": 8228 }, { "epoch": 0.1646, "grad_norm": 2.046875, "grad_norm_var": 0.022739410400390625, "learning_rate": 0.0001, "loss": 4.4378, "loss/crossentropy": 2.1645957231521606, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21431444585323334, "step": 8230 }, { "epoch": 0.16464, "grad_norm": 2.09375, "grad_norm_var": 0.00858154296875, "learning_rate": 0.0001, "loss": 3.8088, "loss/crossentropy": 1.8797736763954163, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19344569742679596, "step": 8232 }, { "epoch": 0.16468, "grad_norm": 2.1875, "grad_norm_var": 0.0148345947265625, "learning_rate": 0.0001, "loss": 4.3119, "loss/crossentropy": 2.092893421649933, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21476523578166962, "step": 8234 }, { "epoch": 0.16472, "grad_norm": 2.125, "grad_norm_var": 0.0143310546875, "learning_rate": 0.0001, "loss": 4.4145, "loss/crossentropy": 2.336071252822876, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25474119186401367, "step": 8236 }, { "epoch": 0.16476, "grad_norm": 2.03125, "grad_norm_var": 0.014410146077473958, "learning_rate": 0.0001, "loss": 4.3054, "loss/crossentropy": 2.008872926235199, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22672003507614136, "step": 8238 }, { "epoch": 0.1648, "grad_norm": 2.171875, "grad_norm_var": 0.011822255452473958, "learning_rate": 0.0001, "loss": 4.4997, "loss/crossentropy": 2.2229605317115784, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2392737865447998, "step": 8240 }, { "epoch": 0.16484, "grad_norm": 2.171875, "grad_norm_var": 0.011525217692057292, "learning_rate": 0.0001, "loss": 4.5894, "loss/crossentropy": 2.1929808855056763, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22832375764846802, "step": 8242 }, { "epoch": 0.16488, "grad_norm": 2.171875, "grad_norm_var": 0.010625966389973958, "learning_rate": 0.0001, "loss": 4.642, "loss/crossentropy": 2.1557860374450684, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24640469253063202, "step": 8244 }, { "epoch": 0.16492, "grad_norm": 2.109375, "grad_norm_var": 0.009582265218098959, "learning_rate": 0.0001, "loss": 4.2508, "loss/crossentropy": 2.2462236881256104, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23639583587646484, "step": 8246 }, { "epoch": 0.16496, "grad_norm": 2.03125, "grad_norm_var": 0.009501139322916666, "learning_rate": 0.0001, "loss": 4.1476, "loss/crossentropy": 2.1238350868225098, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23066531121730804, "step": 8248 }, { "epoch": 0.165, "grad_norm": 2.078125, "grad_norm_var": 0.0049479166666666664, "learning_rate": 0.0001, "loss": 4.1666, "loss/crossentropy": 1.6866248846054077, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20677632093429565, "step": 8250 }, { "epoch": 0.16504, "grad_norm": 2.03125, "grad_norm_var": 0.005338541666666667, "learning_rate": 0.0001, "loss": 4.1494, "loss/crossentropy": 2.0640709400177, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2148914858698845, "step": 8252 }, { "epoch": 0.16508, "grad_norm": 2.234375, "grad_norm_var": 0.00513916015625, "learning_rate": 0.0001, "loss": 4.4138, "loss/crossentropy": 2.003119468688965, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21817681193351746, "step": 8254 }, { "epoch": 0.16512, "grad_norm": 2.328125, "grad_norm_var": 0.007763671875, "learning_rate": 0.0001, "loss": 4.276, "loss/crossentropy": 2.053581953048706, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22257455438375473, "step": 8256 }, { "epoch": 0.16516, "grad_norm": 2.046875, "grad_norm_var": 0.008381144205729166, "learning_rate": 0.0001, "loss": 4.1314, "loss/crossentropy": 1.788454830646515, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21449437737464905, "step": 8258 }, { "epoch": 0.1652, "grad_norm": 2.265625, "grad_norm_var": 0.009032185872395833, "learning_rate": 0.0001, "loss": 4.3986, "loss/crossentropy": 1.8791787028312683, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1877290904521942, "step": 8260 }, { "epoch": 0.16524, "grad_norm": 2.140625, "grad_norm_var": 0.009761555989583334, "learning_rate": 0.0001, "loss": 4.4326, "loss/crossentropy": 2.2346811294555664, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.221164733171463, "step": 8262 }, { "epoch": 0.16528, "grad_norm": 2.03125, "grad_norm_var": 0.009227498372395834, "learning_rate": 0.0001, "loss": 4.3296, "loss/crossentropy": 2.1032413244247437, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21829386800527573, "step": 8264 }, { "epoch": 0.16532, "grad_norm": 2.09375, "grad_norm_var": 0.008690388997395833, "learning_rate": 0.0001, "loss": 4.3188, "loss/crossentropy": 2.1452749967575073, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24733971804380417, "step": 8266 }, { "epoch": 0.16536, "grad_norm": 2.171875, "grad_norm_var": 0.008373006184895834, "learning_rate": 0.0001, "loss": 4.2093, "loss/crossentropy": 1.8318313956260681, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22139593213796616, "step": 8268 }, { "epoch": 0.1654, "grad_norm": 2.09375, "grad_norm_var": 0.0074045817057291664, "learning_rate": 0.0001, "loss": 4.3494, "loss/crossentropy": 1.9865980744361877, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22354473173618317, "step": 8270 }, { "epoch": 0.16544, "grad_norm": 2.21875, "grad_norm_var": 0.005464680989583333, "learning_rate": 0.0001, "loss": 4.318, "loss/crossentropy": 2.140509843826294, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22832269966602325, "step": 8272 }, { "epoch": 0.16548, "grad_norm": 2.15625, "grad_norm_var": 0.005052693684895833, "learning_rate": 0.0001, "loss": 4.1632, "loss/crossentropy": 2.3763319849967957, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22866757214069366, "step": 8274 }, { "epoch": 0.16552, "grad_norm": 2.03125, "grad_norm_var": 0.004198201497395833, "learning_rate": 0.0001, "loss": 4.2366, "loss/crossentropy": 1.785762071609497, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19961480796337128, "step": 8276 }, { "epoch": 0.16556, "grad_norm": 7.34375, "grad_norm_var": 1.7181477864583334, "learning_rate": 0.0001, "loss": 4.5317, "loss/crossentropy": 2.0500977635383606, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2972872704267502, "step": 8278 }, { "epoch": 0.1656, "grad_norm": 2.265625, "grad_norm_var": 1.6987589518229167, "learning_rate": 0.0001, "loss": 4.4247, "loss/crossentropy": 2.3665153980255127, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24115745723247528, "step": 8280 }, { "epoch": 0.16564, "grad_norm": 2.25, "grad_norm_var": 1.6973592122395833, "learning_rate": 0.0001, "loss": 4.326, "loss/crossentropy": 1.9636226892471313, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2048879787325859, "step": 8282 }, { "epoch": 0.16568, "grad_norm": 2.171875, "grad_norm_var": 1.6948720296223958, "learning_rate": 0.0001, "loss": 4.4303, "loss/crossentropy": 2.2624993324279785, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22186043858528137, "step": 8284 }, { "epoch": 0.16572, "grad_norm": 2.046875, "grad_norm_var": 1.7048886617024739, "learning_rate": 0.0001, "loss": 4.0816, "loss/crossentropy": 2.098397970199585, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.219336099922657, "step": 8286 }, { "epoch": 0.16576, "grad_norm": 2.140625, "grad_norm_var": 1.6997393290201823, "learning_rate": 0.0001, "loss": 4.4705, "loss/crossentropy": 1.9795190691947937, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2145320549607277, "step": 8288 }, { "epoch": 0.1658, "grad_norm": 2.203125, "grad_norm_var": 1.6936927795410157, "learning_rate": 0.0001, "loss": 4.4625, "loss/crossentropy": 2.1028788089752197, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22656689584255219, "step": 8290 }, { "epoch": 0.16584, "grad_norm": 1.984375, "grad_norm_var": 1.7015398661295573, "learning_rate": 0.0001, "loss": 4.1901, "loss/crossentropy": 2.2936136722564697, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23688847571611404, "step": 8292 }, { "epoch": 0.16588, "grad_norm": 2.234375, "grad_norm_var": 0.01862360636393229, "learning_rate": 0.0001, "loss": 4.1665, "loss/crossentropy": 1.870754897594452, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20914533734321594, "step": 8294 }, { "epoch": 0.16592, "grad_norm": 2.109375, "grad_norm_var": 0.007999674479166666, "learning_rate": 0.0001, "loss": 3.9884, "loss/crossentropy": 2.034530520439148, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22234197705984116, "step": 8296 }, { "epoch": 0.16596, "grad_norm": 2.203125, "grad_norm_var": 0.0073811848958333336, "learning_rate": 0.0001, "loss": 4.3674, "loss/crossentropy": 1.974400520324707, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20479100942611694, "step": 8298 }, { "epoch": 0.166, "grad_norm": 2.21875, "grad_norm_var": 0.0078765869140625, "learning_rate": 0.0001, "loss": 4.3157, "loss/crossentropy": 2.180828809738159, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2229994386434555, "step": 8300 }, { "epoch": 0.16604, "grad_norm": 2.078125, "grad_norm_var": 0.007624308268229167, "learning_rate": 0.0001, "loss": 4.0704, "loss/crossentropy": 2.098487079143524, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19709115475416183, "step": 8302 }, { "epoch": 0.16608, "grad_norm": 2.0625, "grad_norm_var": 0.007673136393229167, "learning_rate": 0.0001, "loss": 4.2292, "loss/crossentropy": 1.999358892440796, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20891200006008148, "step": 8304 }, { "epoch": 0.16612, "grad_norm": 2.1875, "grad_norm_var": 0.009022776285807292, "learning_rate": 0.0001, "loss": 4.4263, "loss/crossentropy": 2.40866219997406, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22355159372091293, "step": 8306 }, { "epoch": 0.16616, "grad_norm": 2.171875, "grad_norm_var": 0.008156077067057291, "learning_rate": 0.0001, "loss": 4.0532, "loss/crossentropy": 1.774325966835022, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2058887928724289, "step": 8308 }, { "epoch": 0.1662, "grad_norm": 2.125, "grad_norm_var": 0.007352447509765625, "learning_rate": 0.0001, "loss": 4.4951, "loss/crossentropy": 1.9870773553848267, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21368569880723953, "step": 8310 }, { "epoch": 0.16624, "grad_norm": 2.40625, "grad_norm_var": 0.01121826171875, "learning_rate": 0.0001, "loss": 4.3496, "loss/crossentropy": 1.9224175810813904, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2279355376958847, "step": 8312 }, { "epoch": 0.16628, "grad_norm": 2.171875, "grad_norm_var": 0.012230428059895833, "learning_rate": 0.0001, "loss": 4.4074, "loss/crossentropy": 2.011419177055359, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2363004833459854, "step": 8314 }, { "epoch": 0.16632, "grad_norm": 2.328125, "grad_norm_var": 0.01444091796875, "learning_rate": 0.0001, "loss": 4.6627, "loss/crossentropy": 1.9777795672416687, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21830055862665176, "step": 8316 }, { "epoch": 0.16636, "grad_norm": 2.046875, "grad_norm_var": 0.013508860270182292, "learning_rate": 0.0001, "loss": 4.3798, "loss/crossentropy": 2.3334981203079224, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21089734882116318, "step": 8318 }, { "epoch": 0.1664, "grad_norm": 2.28125, "grad_norm_var": 0.015148671468098958, "learning_rate": 0.0001, "loss": 4.3792, "loss/crossentropy": 1.9137234687805176, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21954041719436646, "step": 8320 }, { "epoch": 0.16644, "grad_norm": 2.15625, "grad_norm_var": 0.013492838541666666, "learning_rate": 0.0001, "loss": 4.3826, "loss/crossentropy": 2.1358155608177185, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21321023255586624, "step": 8322 }, { "epoch": 0.16648, "grad_norm": 2.203125, "grad_norm_var": 0.013688151041666667, "learning_rate": 0.0001, "loss": 4.3454, "loss/crossentropy": 2.1747822165489197, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23682481050491333, "step": 8324 }, { "epoch": 0.16652, "grad_norm": 2.078125, "grad_norm_var": 0.013505045572916667, "learning_rate": 0.0001, "loss": 4.2685, "loss/crossentropy": 2.2818257808685303, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24226247519254684, "step": 8326 }, { "epoch": 0.16656, "grad_norm": 2.1875, "grad_norm_var": 0.010155232747395833, "learning_rate": 0.0001, "loss": 4.4806, "loss/crossentropy": 1.9066791534423828, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20610930025577545, "step": 8328 }, { "epoch": 0.1666, "grad_norm": 2.078125, "grad_norm_var": 0.008284505208333333, "learning_rate": 0.0001, "loss": 4.4799, "loss/crossentropy": 2.2431830763816833, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22407780587673187, "step": 8330 }, { "epoch": 0.16664, "grad_norm": 2.125, "grad_norm_var": 0.0067708333333333336, "learning_rate": 0.0001, "loss": 4.5894, "loss/crossentropy": 2.569726347923279, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2647472620010376, "step": 8332 }, { "epoch": 0.16668, "grad_norm": 2.140625, "grad_norm_var": 0.006371053059895834, "learning_rate": 0.0001, "loss": 4.2307, "loss/crossentropy": 1.8364137411117554, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2113686427474022, "step": 8334 }, { "epoch": 0.16672, "grad_norm": 2.078125, "grad_norm_var": 0.007968902587890625, "learning_rate": 0.0001, "loss": 4.0369, "loss/crossentropy": 2.0510441064834595, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2268069088459015, "step": 8336 }, { "epoch": 0.16676, "grad_norm": 2.046875, "grad_norm_var": 0.007968902587890625, "learning_rate": 0.0001, "loss": 4.2528, "loss/crossentropy": 1.7306728959083557, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20341812074184418, "step": 8338 }, { "epoch": 0.1668, "grad_norm": 2.578125, "grad_norm_var": 0.023361968994140624, "learning_rate": 0.0001, "loss": 4.8146, "loss/crossentropy": 2.398088574409485, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2426001876592636, "step": 8340 }, { "epoch": 0.16684, "grad_norm": 2.109375, "grad_norm_var": 0.023128000895182292, "learning_rate": 0.0001, "loss": 4.0659, "loss/crossentropy": 2.1259487867355347, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22666773200035095, "step": 8342 }, { "epoch": 0.16688, "grad_norm": 2.109375, "grad_norm_var": 0.022141265869140624, "learning_rate": 0.0001, "loss": 4.2857, "loss/crossentropy": 1.9744665026664734, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2887374758720398, "step": 8344 }, { "epoch": 0.16692, "grad_norm": 2.09375, "grad_norm_var": 0.02316869099934896, "learning_rate": 0.0001, "loss": 4.1989, "loss/crossentropy": 1.9841225743293762, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2008478343486786, "step": 8346 }, { "epoch": 0.16696, "grad_norm": 2.125, "grad_norm_var": 0.1913469950358073, "learning_rate": 0.0001, "loss": 4.5208, "loss/crossentropy": 2.060486137866974, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23954987525939941, "step": 8348 }, { "epoch": 0.167, "grad_norm": 2.03125, "grad_norm_var": 0.19230931599934895, "learning_rate": 0.0001, "loss": 4.2185, "loss/crossentropy": 1.9356245398521423, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1987425833940506, "step": 8350 }, { "epoch": 0.16704, "grad_norm": 2.203125, "grad_norm_var": 0.1835845947265625, "learning_rate": 0.0001, "loss": 4.1875, "loss/crossentropy": 1.9968677163124084, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21396416425704956, "step": 8352 }, { "epoch": 0.16708, "grad_norm": 2.109375, "grad_norm_var": 0.1820465087890625, "learning_rate": 0.0001, "loss": 4.1696, "loss/crossentropy": 2.1678614616394043, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22860293090343475, "step": 8354 }, { "epoch": 0.16712, "grad_norm": 2.078125, "grad_norm_var": 0.17534891764322916, "learning_rate": 0.0001, "loss": 4.1761, "loss/crossentropy": 1.7752264142036438, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20555391907691956, "step": 8356 }, { "epoch": 0.16716, "grad_norm": 2.046875, "grad_norm_var": 0.1767578125, "learning_rate": 0.0001, "loss": 4.2391, "loss/crossentropy": 1.7073925137519836, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19325412809848785, "step": 8358 }, { "epoch": 0.1672, "grad_norm": 2.4375, "grad_norm_var": 0.17827860514322916, "learning_rate": 0.0001, "loss": 4.7185, "loss/crossentropy": 2.2200660705566406, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21834757924079895, "step": 8360 }, { "epoch": 0.16724, "grad_norm": 2.03125, "grad_norm_var": 0.17594401041666666, "learning_rate": 0.0001, "loss": 4.3551, "loss/crossentropy": 2.3598183393478394, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25592371821403503, "step": 8362 }, { "epoch": 0.16728, "grad_norm": 2.03125, "grad_norm_var": 0.014644368489583334, "learning_rate": 0.0001, "loss": 4.3429, "loss/crossentropy": 2.342926025390625, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2480776235461235, "step": 8364 }, { "epoch": 0.16732, "grad_norm": 2.046875, "grad_norm_var": 0.014411417643229167, "learning_rate": 0.0001, "loss": 4.2802, "loss/crossentropy": 1.8505961894989014, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21376194059848785, "step": 8366 }, { "epoch": 0.16736, "grad_norm": 2.078125, "grad_norm_var": 0.0146636962890625, "learning_rate": 0.0001, "loss": 4.4199, "loss/crossentropy": 2.2064108848571777, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21334534883499146, "step": 8368 }, { "epoch": 0.1674, "grad_norm": 2.6875, "grad_norm_var": 0.0338043212890625, "learning_rate": 0.0001, "loss": 4.5344, "loss/crossentropy": 2.1280709505081177, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24501197040081024, "step": 8370 }, { "epoch": 0.16744, "grad_norm": 2.203125, "grad_norm_var": 0.03312886555989583, "learning_rate": 0.0001, "loss": 4.3382, "loss/crossentropy": 2.2134695053100586, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2368277609348297, "step": 8372 }, { "epoch": 0.16748, "grad_norm": 2.03125, "grad_norm_var": 0.03319905598958333, "learning_rate": 0.0001, "loss": 4.1197, "loss/crossentropy": 1.6942040920257568, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21226602047681808, "step": 8374 }, { "epoch": 0.16752, "grad_norm": 2.09375, "grad_norm_var": 0.025755818684895834, "learning_rate": 0.0001, "loss": 4.5004, "loss/crossentropy": 2.1554355025291443, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2655039578676224, "step": 8376 }, { "epoch": 0.16756, "grad_norm": 2.078125, "grad_norm_var": 0.024800618489583332, "learning_rate": 0.0001, "loss": 4.2446, "loss/crossentropy": 1.8107115030288696, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1756090670824051, "step": 8378 }, { "epoch": 0.1676, "grad_norm": 1.921875, "grad_norm_var": 0.026496378580729167, "learning_rate": 0.0001, "loss": 4.0745, "loss/crossentropy": 1.655519425868988, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1760415881872177, "step": 8380 }, { "epoch": 0.16764, "grad_norm": 2.140625, "grad_norm_var": 0.026460774739583335, "learning_rate": 0.0001, "loss": 4.2369, "loss/crossentropy": 2.1618025302886963, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21886380016803741, "step": 8382 }, { "epoch": 0.16768, "grad_norm": 2.109375, "grad_norm_var": 0.028270467122395834, "learning_rate": 0.0001, "loss": 4.0798, "loss/crossentropy": 2.043319880962372, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21368451416492462, "step": 8384 }, { "epoch": 0.16772, "grad_norm": 2.1875, "grad_norm_var": 0.010724894205729167, "learning_rate": 0.0001, "loss": 4.4035, "loss/crossentropy": 2.3043102025985718, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25452760607004166, "step": 8386 }, { "epoch": 0.16776, "grad_norm": 2.21875, "grad_norm_var": 0.011546834309895834, "learning_rate": 0.0001, "loss": 4.5633, "loss/crossentropy": 2.3627192974090576, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2438303530216217, "step": 8388 }, { "epoch": 0.1678, "grad_norm": 2.109375, "grad_norm_var": 0.011335245768229167, "learning_rate": 0.0001, "loss": 4.4339, "loss/crossentropy": 2.0960012674331665, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20813053101301193, "step": 8390 }, { "epoch": 0.16784, "grad_norm": 1.921875, "grad_norm_var": 0.013923136393229167, "learning_rate": 0.0001, "loss": 4.0401, "loss/crossentropy": 1.9132550358772278, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19716795533895493, "step": 8392 }, { "epoch": 0.16788, "grad_norm": 2.078125, "grad_norm_var": 0.014742024739583333, "learning_rate": 0.0001, "loss": 4.2371, "loss/crossentropy": 2.1912059783935547, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.233934685587883, "step": 8394 }, { "epoch": 0.16792, "grad_norm": 2.0625, "grad_norm_var": 0.012691243489583334, "learning_rate": 0.0001, "loss": 4.4501, "loss/crossentropy": 2.165616512298584, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23475389927625656, "step": 8396 }, { "epoch": 0.16796, "grad_norm": 2.078125, "grad_norm_var": 0.01265869140625, "learning_rate": 0.0001, "loss": 4.4841, "loss/crossentropy": 2.209625542163849, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24280209839344025, "step": 8398 }, { "epoch": 0.168, "grad_norm": 2.125, "grad_norm_var": 0.013606516520182292, "learning_rate": 0.0001, "loss": 3.8959, "loss/crossentropy": 1.972103476524353, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2072778418660164, "step": 8400 }, { "epoch": 0.16804, "grad_norm": 2.171875, "grad_norm_var": 0.009663645426432292, "learning_rate": 0.0001, "loss": 4.4646, "loss/crossentropy": 2.402593731880188, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23028723895549774, "step": 8402 }, { "epoch": 0.16808, "grad_norm": 2.125, "grad_norm_var": 0.008754221598307292, "learning_rate": 0.0001, "loss": 4.392, "loss/crossentropy": 2.218156576156616, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24104444682598114, "step": 8404 }, { "epoch": 0.16812, "grad_norm": 2.09375, "grad_norm_var": 0.008722941080729166, "learning_rate": 0.0001, "loss": 4.2713, "loss/crossentropy": 1.9802079796791077, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2134247124195099, "step": 8406 }, { "epoch": 0.16816, "grad_norm": 1.953125, "grad_norm_var": 0.008577219645182292, "learning_rate": 0.0001, "loss": 3.9765, "loss/crossentropy": 2.0322983264923096, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21680974960327148, "step": 8408 }, { "epoch": 0.1682, "grad_norm": 2.03125, "grad_norm_var": 0.008194732666015624, "learning_rate": 0.0001, "loss": 4.2457, "loss/crossentropy": 2.11838436126709, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2319432571530342, "step": 8410 }, { "epoch": 0.16824, "grad_norm": 2.125, "grad_norm_var": 0.013398996988932292, "learning_rate": 0.0001, "loss": 4.2204, "loss/crossentropy": 2.3801279067993164, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23981253802776337, "step": 8412 }, { "epoch": 0.16828, "grad_norm": 2.078125, "grad_norm_var": 0.013042958577473958, "learning_rate": 0.0001, "loss": 4.2766, "loss/crossentropy": 2.0953307151794434, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22835668921470642, "step": 8414 }, { "epoch": 0.16832, "grad_norm": 2.15625, "grad_norm_var": 0.011774698893229166, "learning_rate": 0.0001, "loss": 4.528, "loss/crossentropy": 2.0654338598251343, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2100541964173317, "step": 8416 }, { "epoch": 0.16836, "grad_norm": 2.0625, "grad_norm_var": 0.018184407552083334, "learning_rate": 0.0001, "loss": 4.5734, "loss/crossentropy": 2.383318305015564, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.246080219745636, "step": 8418 }, { "epoch": 0.1684, "grad_norm": 2.078125, "grad_norm_var": 0.018318684895833333, "learning_rate": 0.0001, "loss": 4.2764, "loss/crossentropy": 2.103544294834137, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20659767091274261, "step": 8420 }, { "epoch": 0.16844, "grad_norm": 2.09375, "grad_norm_var": 0.017437489827473958, "learning_rate": 0.0001, "loss": 4.2737, "loss/crossentropy": 2.064394950866699, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23458171635866165, "step": 8422 }, { "epoch": 0.16848, "grad_norm": 2.25, "grad_norm_var": 0.014872233072916666, "learning_rate": 0.0001, "loss": 4.4133, "loss/crossentropy": 2.087044835090637, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23527196049690247, "step": 8424 }, { "epoch": 0.16852, "grad_norm": 2.15625, "grad_norm_var": 0.0133941650390625, "learning_rate": 0.0001, "loss": 4.5867, "loss/crossentropy": 2.41584312915802, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2425323873758316, "step": 8426 }, { "epoch": 0.16856, "grad_norm": 2.09375, "grad_norm_var": 0.010400390625, "learning_rate": 0.0001, "loss": 4.3292, "loss/crossentropy": 2.2542352080345154, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2330578714609146, "step": 8428 }, { "epoch": 0.1686, "grad_norm": 2.09375, "grad_norm_var": 0.010724894205729167, "learning_rate": 0.0001, "loss": 4.6628, "loss/crossentropy": 2.453263282775879, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.280864879488945, "step": 8430 }, { "epoch": 0.16864, "grad_norm": 2.140625, "grad_norm_var": 0.01197509765625, "learning_rate": 0.0001, "loss": 4.0443, "loss/crossentropy": 1.9185429811477661, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19230744242668152, "step": 8432 }, { "epoch": 0.16868, "grad_norm": 2.140625, "grad_norm_var": 0.005956013997395833, "learning_rate": 0.0001, "loss": 4.1683, "loss/crossentropy": 2.020436644554138, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21756108105182648, "step": 8434 }, { "epoch": 0.16872, "grad_norm": 2.328125, "grad_norm_var": 0.007616170247395833, "learning_rate": 0.0001, "loss": 4.7727, "loss/crossentropy": 2.3050636053085327, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24950820207595825, "step": 8436 }, { "epoch": 0.16876, "grad_norm": 2.0625, "grad_norm_var": 0.01510009765625, "learning_rate": 0.0001, "loss": 4.4091, "loss/crossentropy": 2.2787232398986816, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24267014116048813, "step": 8438 }, { "epoch": 0.1688, "grad_norm": 2.078125, "grad_norm_var": 0.015327962239583333, "learning_rate": 0.0001, "loss": 4.2927, "loss/crossentropy": 2.185176372528076, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2157401591539383, "step": 8440 }, { "epoch": 0.16884, "grad_norm": 2.265625, "grad_norm_var": 0.01558837890625, "learning_rate": 0.0001, "loss": 4.2087, "loss/crossentropy": 2.0673694610595703, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23145683109760284, "step": 8442 }, { "epoch": 0.16888, "grad_norm": 2.171875, "grad_norm_var": 0.01754150390625, "learning_rate": 0.0001, "loss": 4.0229, "loss/crossentropy": 1.9011740684509277, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21629629284143448, "step": 8444 }, { "epoch": 0.16892, "grad_norm": 2.34375, "grad_norm_var": 0.018651326497395832, "learning_rate": 0.0001, "loss": 4.4821, "loss/crossentropy": 2.055977463722229, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2275683432817459, "step": 8446 }, { "epoch": 0.16896, "grad_norm": 2.140625, "grad_norm_var": 0.0171295166015625, "learning_rate": 0.0001, "loss": 4.2152, "loss/crossentropy": 1.752756416797638, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19437911361455917, "step": 8448 }, { "epoch": 0.169, "grad_norm": 2.09375, "grad_norm_var": 0.015706380208333332, "learning_rate": 0.0001, "loss": 4.3631, "loss/crossentropy": 2.479012131690979, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23593680560588837, "step": 8450 }, { "epoch": 0.16904, "grad_norm": 2.1875, "grad_norm_var": 0.015523274739583334, "learning_rate": 0.0001, "loss": 4.3245, "loss/crossentropy": 2.065472185611725, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2148493528366089, "step": 8452 }, { "epoch": 0.16908, "grad_norm": 2.078125, "grad_norm_var": 0.015901692708333335, "learning_rate": 0.0001, "loss": 4.3955, "loss/crossentropy": 2.214062213897705, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25444991141557693, "step": 8454 }, { "epoch": 0.16912, "grad_norm": 2.296875, "grad_norm_var": 0.0155426025390625, "learning_rate": 0.0001, "loss": 4.2059, "loss/crossentropy": 1.7410383224487305, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21134207397699356, "step": 8456 }, { "epoch": 0.16916, "grad_norm": 2.234375, "grad_norm_var": 0.020542144775390625, "learning_rate": 0.0001, "loss": 4.196, "loss/crossentropy": 1.9303107857704163, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18110749125480652, "step": 8458 }, { "epoch": 0.1692, "grad_norm": 2.203125, "grad_norm_var": 0.01810480753580729, "learning_rate": 0.0001, "loss": 4.3916, "loss/crossentropy": 1.9091919660568237, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21320972591638565, "step": 8460 }, { "epoch": 0.16924, "grad_norm": 2.0625, "grad_norm_var": 0.025099436442057293, "learning_rate": 0.0001, "loss": 4.4075, "loss/crossentropy": 1.8021087050437927, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21294282376766205, "step": 8462 }, { "epoch": 0.16928, "grad_norm": 2.0, "grad_norm_var": 0.027186838785807292, "learning_rate": 0.0001, "loss": 3.97, "loss/crossentropy": 2.018254518508911, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21781788766384125, "step": 8464 }, { "epoch": 0.16932, "grad_norm": 2.0625, "grad_norm_var": 0.02906061808268229, "learning_rate": 0.0001, "loss": 4.5665, "loss/crossentropy": 2.0593737959861755, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24223209917545319, "step": 8466 }, { "epoch": 0.16936, "grad_norm": 2.125, "grad_norm_var": 0.029504140218098957, "learning_rate": 0.0001, "loss": 4.3116, "loss/crossentropy": 2.218974232673645, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22679369151592255, "step": 8468 }, { "epoch": 0.1694, "grad_norm": 2.078125, "grad_norm_var": 0.022332509358723957, "learning_rate": 0.0001, "loss": 4.3313, "loss/crossentropy": 2.182355046272278, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23690057545900345, "step": 8470 }, { "epoch": 0.16944, "grad_norm": 2.375, "grad_norm_var": 0.024930572509765624, "learning_rate": 0.0001, "loss": 4.6241, "loss/crossentropy": 2.1669063568115234, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2458687722682953, "step": 8472 }, { "epoch": 0.16948, "grad_norm": 2.171875, "grad_norm_var": 0.021100870768229165, "learning_rate": 0.0001, "loss": 4.3181, "loss/crossentropy": 2.0656558871269226, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20421817898750305, "step": 8474 }, { "epoch": 0.16952, "grad_norm": 2.03125, "grad_norm_var": 0.024625651041666665, "learning_rate": 0.0001, "loss": 4.4916, "loss/crossentropy": 2.1470741033554077, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20798998326063156, "step": 8476 }, { "epoch": 0.16956, "grad_norm": 2.1875, "grad_norm_var": 0.014972941080729166, "learning_rate": 0.0001, "loss": 4.2616, "loss/crossentropy": 2.206741452217102, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2327529340982437, "step": 8478 }, { "epoch": 0.1696, "grad_norm": 2.171875, "grad_norm_var": 0.012596638997395833, "learning_rate": 0.0001, "loss": 4.3338, "loss/crossentropy": 2.4421777725219727, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2385794073343277, "step": 8480 }, { "epoch": 0.16964, "grad_norm": 2.015625, "grad_norm_var": 0.0115234375, "learning_rate": 0.0001, "loss": 3.963, "loss/crossentropy": 1.8545736074447632, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18871797621250153, "step": 8482 }, { "epoch": 0.16968, "grad_norm": 2.078125, "grad_norm_var": 0.0118072509765625, "learning_rate": 0.0001, "loss": 4.1352, "loss/crossentropy": 1.807646930217743, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19625309854745865, "step": 8484 }, { "epoch": 0.16972, "grad_norm": 2.1875, "grad_norm_var": 0.012202962239583334, "learning_rate": 0.0001, "loss": 4.5029, "loss/crossentropy": 1.923595905303955, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2233208492398262, "step": 8486 }, { "epoch": 0.16976, "grad_norm": 2.0625, "grad_norm_var": 0.0080718994140625, "learning_rate": 0.0001, "loss": 4.3251, "loss/crossentropy": 2.1018277406692505, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2556355446577072, "step": 8488 }, { "epoch": 0.1698, "grad_norm": 1.90625, "grad_norm_var": 0.01099853515625, "learning_rate": 0.0001, "loss": 4.1127, "loss/crossentropy": 1.9638542532920837, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2101784572005272, "step": 8490 }, { "epoch": 0.16984, "grad_norm": 2.109375, "grad_norm_var": 0.008703358968098958, "learning_rate": 0.0001, "loss": 3.9463, "loss/crossentropy": 1.7770507335662842, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2004244327545166, "step": 8492 }, { "epoch": 0.16988, "grad_norm": 2.171875, "grad_norm_var": 0.008573150634765625, "learning_rate": 0.0001, "loss": 4.3901, "loss/crossentropy": 2.4064877033233643, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23509501665830612, "step": 8494 }, { "epoch": 0.16992, "grad_norm": 2.0, "grad_norm_var": 0.008713531494140624, "learning_rate": 0.0001, "loss": 4.3197, "loss/crossentropy": 2.183193802833557, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22985967248678207, "step": 8496 }, { "epoch": 0.16996, "grad_norm": 2.203125, "grad_norm_var": 0.009126536051432292, "learning_rate": 0.0001, "loss": 4.4983, "loss/crossentropy": 2.241714060306549, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22633583843708038, "step": 8498 }, { "epoch": 0.17, "grad_norm": 1.9921875, "grad_norm_var": 0.0097808837890625, "learning_rate": 0.0001, "loss": 4.32, "loss/crossentropy": 2.212075114250183, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20900271832942963, "step": 8500 }, { "epoch": 0.17004, "grad_norm": 2.09375, "grad_norm_var": 0.0071685791015625, "learning_rate": 0.0001, "loss": 4.1885, "loss/crossentropy": 2.1283940076828003, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21419794112443924, "step": 8502 }, { "epoch": 0.17008, "grad_norm": 2.15625, "grad_norm_var": 0.00731201171875, "learning_rate": 0.0001, "loss": 4.3498, "loss/crossentropy": 2.1958925127983093, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23375380039215088, "step": 8504 }, { "epoch": 0.17012, "grad_norm": 2.171875, "grad_norm_var": 0.0065673828125, "learning_rate": 0.0001, "loss": 4.3396, "loss/crossentropy": 1.7883376479148865, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2097182646393776, "step": 8506 }, { "epoch": 0.17016, "grad_norm": 2.296875, "grad_norm_var": 0.006811269124348958, "learning_rate": 0.0001, "loss": 4.5606, "loss/crossentropy": 2.114001750946045, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2059268057346344, "step": 8508 }, { "epoch": 0.1702, "grad_norm": 2.15625, "grad_norm_var": 0.006929270426432292, "learning_rate": 0.0001, "loss": 4.3178, "loss/crossentropy": 2.2413129806518555, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2517316862940788, "step": 8510 }, { "epoch": 0.17024, "grad_norm": 1.921875, "grad_norm_var": 0.015083567301432291, "learning_rate": 0.0001, "loss": 4.4202, "loss/crossentropy": 2.0968031883239746, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2454070746898651, "step": 8512 }, { "epoch": 0.17028, "grad_norm": 1.875, "grad_norm_var": 0.01945978800455729, "learning_rate": 0.0001, "loss": 4.407, "loss/crossentropy": 1.9813454151153564, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21238256990909576, "step": 8514 }, { "epoch": 0.17032, "grad_norm": 2.28125, "grad_norm_var": 0.019755045572916668, "learning_rate": 0.0001, "loss": 4.4724, "loss/crossentropy": 2.261451005935669, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2393329069018364, "step": 8516 }, { "epoch": 0.17036, "grad_norm": 2.171875, "grad_norm_var": 0.020654296875, "learning_rate": 0.0001, "loss": 4.5377, "loss/crossentropy": 1.9661999344825745, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2088441476225853, "step": 8518 }, { "epoch": 0.1704, "grad_norm": 2.046875, "grad_norm_var": 0.022163899739583333, "learning_rate": 0.0001, "loss": 3.9288, "loss/crossentropy": 1.8275291323661804, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20091407746076584, "step": 8520 }, { "epoch": 0.17044, "grad_norm": 2.046875, "grad_norm_var": 0.023346964518229166, "learning_rate": 0.0001, "loss": 4.3154, "loss/crossentropy": 2.266944646835327, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2514675632119179, "step": 8522 }, { "epoch": 0.17048, "grad_norm": 2.28125, "grad_norm_var": 0.02271728515625, "learning_rate": 0.0001, "loss": 4.3571, "loss/crossentropy": 2.244120240211487, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25534868240356445, "step": 8524 }, { "epoch": 0.17052, "grad_norm": 2.015625, "grad_norm_var": 0.025048828125, "learning_rate": 0.0001, "loss": 4.3037, "loss/crossentropy": 1.8628552556037903, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21675898134708405, "step": 8526 }, { "epoch": 0.17056, "grad_norm": 2.109375, "grad_norm_var": 0.01597900390625, "learning_rate": 0.0001, "loss": 4.3883, "loss/crossentropy": 1.971808135509491, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2143295481801033, "step": 8528 }, { "epoch": 0.1706, "grad_norm": 2.0625, "grad_norm_var": 0.0114410400390625, "learning_rate": 0.0001, "loss": 4.3481, "loss/crossentropy": 1.959191381931305, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24014096707105637, "step": 8530 }, { "epoch": 0.17064, "grad_norm": 3.0, "grad_norm_var": 0.05729878743489583, "learning_rate": 0.0001, "loss": 4.2633, "loss/crossentropy": 1.836454451084137, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20957449078559875, "step": 8532 }, { "epoch": 0.17068, "grad_norm": 2.140625, "grad_norm_var": 0.06005452473958333, "learning_rate": 0.0001, "loss": 4.1254, "loss/crossentropy": 2.003947675228119, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23538918048143387, "step": 8534 }, { "epoch": 0.17072, "grad_norm": 2.375, "grad_norm_var": 0.05788472493489583, "learning_rate": 0.0001, "loss": 4.4412, "loss/crossentropy": 2.0154194831848145, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2278473973274231, "step": 8536 }, { "epoch": 0.17076, "grad_norm": 2.015625, "grad_norm_var": 0.05548502604166667, "learning_rate": 0.0001, "loss": 4.2426, "loss/crossentropy": 2.1560275554656982, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21601636707782745, "step": 8538 }, { "epoch": 0.1708, "grad_norm": 2.078125, "grad_norm_var": 0.05689697265625, "learning_rate": 0.0001, "loss": 4.2258, "loss/crossentropy": 2.1494773626327515, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22883395850658417, "step": 8540 }, { "epoch": 0.17084, "grad_norm": 2.234375, "grad_norm_var": 0.05607808430989583, "learning_rate": 0.0001, "loss": 4.5201, "loss/crossentropy": 2.20908784866333, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26363062113523483, "step": 8542 }, { "epoch": 0.17088, "grad_norm": 2.0625, "grad_norm_var": 0.0566802978515625, "learning_rate": 0.0001, "loss": 4.2477, "loss/crossentropy": 1.7932087182998657, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18538028001785278, "step": 8544 }, { "epoch": 0.17092, "grad_norm": 2.078125, "grad_norm_var": 0.05625, "learning_rate": 0.0001, "loss": 4.471, "loss/crossentropy": 2.069899260997772, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22498925775289536, "step": 8546 }, { "epoch": 0.17096, "grad_norm": 2.46875, "grad_norm_var": 0.019169108072916666, "learning_rate": 0.0001, "loss": 4.474, "loss/crossentropy": 2.008604884147644, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21895557641983032, "step": 8548 }, { "epoch": 0.171, "grad_norm": 2.171875, "grad_norm_var": 0.018798828125, "learning_rate": 0.0001, "loss": 4.1973, "loss/crossentropy": 2.0652626156806946, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23341115564107895, "step": 8550 }, { "epoch": 0.17104, "grad_norm": 2.234375, "grad_norm_var": 0.015607706705729167, "learning_rate": 0.0001, "loss": 4.383, "loss/crossentropy": 1.9780349135398865, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22891747951507568, "step": 8552 }, { "epoch": 0.17108, "grad_norm": 2.5, "grad_norm_var": 0.022980753580729166, "learning_rate": 0.0001, "loss": 4.4424, "loss/crossentropy": 2.163089871406555, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2447328343987465, "step": 8554 }, { "epoch": 0.17112, "grad_norm": 2.21875, "grad_norm_var": 0.022945149739583334, "learning_rate": 0.0001, "loss": 4.1503, "loss/crossentropy": 2.2946064472198486, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24353116750717163, "step": 8556 }, { "epoch": 0.17116, "grad_norm": 2.078125, "grad_norm_var": 0.022459920247395834, "learning_rate": 0.0001, "loss": 4.3049, "loss/crossentropy": 2.0238161087036133, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2153262495994568, "step": 8558 }, { "epoch": 0.1712, "grad_norm": 2.09375, "grad_norm_var": 0.02604955037434896, "learning_rate": 0.0001, "loss": 3.8636, "loss/crossentropy": 1.5683120489120483, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17940818518400192, "step": 8560 }, { "epoch": 0.17124, "grad_norm": 2.296875, "grad_norm_var": 0.027337392171223957, "learning_rate": 0.0001, "loss": 4.3867, "loss/crossentropy": 1.9956589937210083, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22829821705818176, "step": 8562 }, { "epoch": 0.17128, "grad_norm": 2.640625, "grad_norm_var": 0.036834462483723955, "learning_rate": 0.0001, "loss": 4.7055, "loss/crossentropy": 2.2242285013198853, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23567892611026764, "step": 8564 }, { "epoch": 0.17132, "grad_norm": 2.046875, "grad_norm_var": 0.033719635009765624, "learning_rate": 0.0001, "loss": 4.274, "loss/crossentropy": 2.2143776416778564, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2457498088479042, "step": 8566 }, { "epoch": 0.17136, "grad_norm": 2.15625, "grad_norm_var": 0.033782704671223955, "learning_rate": 0.0001, "loss": 4.354, "loss/crossentropy": 1.852292537689209, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24004538357257843, "step": 8568 }, { "epoch": 0.1714, "grad_norm": 2.21875, "grad_norm_var": 0.02676976521809896, "learning_rate": 0.0001, "loss": 4.4429, "loss/crossentropy": 2.311089515686035, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22059186547994614, "step": 8570 }, { "epoch": 0.17144, "grad_norm": 2.0625, "grad_norm_var": 0.027675120035807292, "learning_rate": 0.0001, "loss": 4.1697, "loss/crossentropy": 1.9324169754981995, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21051711589097977, "step": 8572 }, { "epoch": 0.17148, "grad_norm": 2.171875, "grad_norm_var": 0.02904052734375, "learning_rate": 0.0001, "loss": 4.2457, "loss/crossentropy": 1.982999861240387, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2084646075963974, "step": 8574 }, { "epoch": 0.17152, "grad_norm": 2.078125, "grad_norm_var": 0.02505671183268229, "learning_rate": 0.0001, "loss": 4.2503, "loss/crossentropy": 2.1837204694747925, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22953644394874573, "step": 8576 }, { "epoch": 0.17156, "grad_norm": 2.25, "grad_norm_var": 0.024102528889973957, "learning_rate": 0.0001, "loss": 4.4448, "loss/crossentropy": 2.2588841319084167, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22851599752902985, "step": 8578 }, { "epoch": 0.1716, "grad_norm": 2.390625, "grad_norm_var": 0.011502838134765625, "learning_rate": 0.0001, "loss": 4.2033, "loss/crossentropy": 1.982733964920044, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21284686028957367, "step": 8580 }, { "epoch": 0.17164, "grad_norm": 2.046875, "grad_norm_var": 0.012143707275390625, "learning_rate": 0.0001, "loss": 4.1238, "loss/crossentropy": 2.443873167037964, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23966002464294434, "step": 8582 }, { "epoch": 0.17168, "grad_norm": 5.09375, "grad_norm_var": 0.5580645243326823, "learning_rate": 0.0001, "loss": 4.2654, "loss/crossentropy": 2.462417483329773, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2341725453734398, "step": 8584 }, { "epoch": 0.17172, "grad_norm": 2.453125, "grad_norm_var": 0.5521705627441407, "learning_rate": 0.0001, "loss": 4.183, "loss/crossentropy": 1.8569464683532715, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.224809430539608, "step": 8586 }, { "epoch": 0.17176, "grad_norm": 2.609375, "grad_norm_var": 0.55804443359375, "learning_rate": 0.0001, "loss": 4.2762, "loss/crossentropy": 2.017254650592804, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2259984165430069, "step": 8588 }, { "epoch": 0.1718, "grad_norm": 2.1875, "grad_norm_var": 0.550066884358724, "learning_rate": 0.0001, "loss": 4.463, "loss/crossentropy": 1.9804525971412659, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20844170451164246, "step": 8590 }, { "epoch": 0.17184, "grad_norm": 2.015625, "grad_norm_var": 0.544781239827474, "learning_rate": 0.0001, "loss": 4.2676, "loss/crossentropy": 2.2506592869758606, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22404606640338898, "step": 8592 }, { "epoch": 0.17188, "grad_norm": 2.0625, "grad_norm_var": 0.5579335530598958, "learning_rate": 0.0001, "loss": 4.197, "loss/crossentropy": 1.9515153765678406, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.216490276157856, "step": 8594 }, { "epoch": 0.17192, "grad_norm": 2.109375, "grad_norm_var": 0.5598052978515625, "learning_rate": 0.0001, "loss": 4.4037, "loss/crossentropy": 2.090883791446686, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22496677190065384, "step": 8596 }, { "epoch": 0.17196, "grad_norm": 2.828125, "grad_norm_var": 0.5523844401041667, "learning_rate": 0.0001, "loss": 4.8429, "loss/crossentropy": 2.4344359636306763, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2748369127511978, "step": 8598 }, { "epoch": 0.172, "grad_norm": 2.15625, "grad_norm_var": 0.05840250651041667, "learning_rate": 0.0001, "loss": 4.4359, "loss/crossentropy": 1.9280555844306946, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20140594244003296, "step": 8600 }, { "epoch": 0.17204, "grad_norm": 2.15625, "grad_norm_var": 0.05537007649739583, "learning_rate": 0.0001, "loss": 4.2802, "loss/crossentropy": 2.045006573200226, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21741003543138504, "step": 8602 }, { "epoch": 0.17208, "grad_norm": 2.0625, "grad_norm_var": 0.04461034138997396, "learning_rate": 0.0001, "loss": 4.1622, "loss/crossentropy": 2.092265546321869, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20971956849098206, "step": 8604 }, { "epoch": 0.17212, "grad_norm": 1.9375, "grad_norm_var": 0.048130035400390625, "learning_rate": 0.0001, "loss": 4.16, "loss/crossentropy": 1.794768512248993, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20680297911167145, "step": 8606 }, { "epoch": 0.17216, "grad_norm": 2.109375, "grad_norm_var": 0.05690078735351563, "learning_rate": 0.0001, "loss": 4.2213, "loss/crossentropy": 1.9316805601119995, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2163936346769333, "step": 8608 }, { "epoch": 0.1722, "grad_norm": 2.359375, "grad_norm_var": 0.05872294108072917, "learning_rate": 0.0001, "loss": 4.5981, "loss/crossentropy": 2.2786675691604614, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2411205694079399, "step": 8610 }, { "epoch": 0.17224, "grad_norm": 2.234375, "grad_norm_var": 0.05852457682291667, "learning_rate": 0.0001, "loss": 4.5688, "loss/crossentropy": 1.9211469888687134, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23195043951272964, "step": 8612 }, { "epoch": 0.17228, "grad_norm": 1.9921875, "grad_norm_var": 0.037393951416015626, "learning_rate": 0.0001, "loss": 4.1776, "loss/crossentropy": 1.900360643863678, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2339140772819519, "step": 8614 }, { "epoch": 0.17232, "grad_norm": 2.125, "grad_norm_var": 0.03743464152018229, "learning_rate": 0.0001, "loss": 4.4408, "loss/crossentropy": 1.976994514465332, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.218951515853405, "step": 8616 }, { "epoch": 0.17236, "grad_norm": 2.09375, "grad_norm_var": 0.035982004801432294, "learning_rate": 0.0001, "loss": 4.1988, "loss/crossentropy": 2.045244038105011, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20886047929525375, "step": 8618 }, { "epoch": 0.1724, "grad_norm": 2.25, "grad_norm_var": 0.03240534464518229, "learning_rate": 0.0001, "loss": 4.5082, "loss/crossentropy": 2.21256685256958, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21809116005897522, "step": 8620 }, { "epoch": 0.17244, "grad_norm": 2.171875, "grad_norm_var": 0.02641779581705729, "learning_rate": 0.0001, "loss": 4.6127, "loss/crossentropy": 2.300337314605713, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22060109674930573, "step": 8622 }, { "epoch": 0.17248, "grad_norm": 2.015625, "grad_norm_var": 0.016932932535807292, "learning_rate": 0.0001, "loss": 4.2411, "loss/crossentropy": 1.8734883666038513, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20403072237968445, "step": 8624 }, { "epoch": 0.17252, "grad_norm": 2.5625, "grad_norm_var": 0.01962865193684896, "learning_rate": 0.0001, "loss": 4.6907, "loss/crossentropy": 2.1382813453674316, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.27397096157073975, "step": 8626 }, { "epoch": 0.17256, "grad_norm": 2.0625, "grad_norm_var": 0.019760894775390624, "learning_rate": 0.0001, "loss": 4.0917, "loss/crossentropy": 1.9718505144119263, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20658842474222183, "step": 8628 }, { "epoch": 0.1726, "grad_norm": 2.125, "grad_norm_var": 0.016706339518229165, "learning_rate": 0.0001, "loss": 4.5233, "loss/crossentropy": 2.0957319736480713, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2187560573220253, "step": 8630 }, { "epoch": 0.17264, "grad_norm": 1.984375, "grad_norm_var": 0.019954427083333334, "learning_rate": 0.0001, "loss": 4.0986, "loss/crossentropy": 2.0504234433174133, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22376062721014023, "step": 8632 }, { "epoch": 0.17268, "grad_norm": 2.21875, "grad_norm_var": 0.05233968098958333, "learning_rate": 0.0001, "loss": 4.4513, "loss/crossentropy": 2.057171046733856, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23209689557552338, "step": 8634 }, { "epoch": 0.17272, "grad_norm": 2.109375, "grad_norm_var": 0.052611287434895834, "learning_rate": 0.0001, "loss": 4.3473, "loss/crossentropy": 1.9635317921638489, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21854296326637268, "step": 8636 }, { "epoch": 0.17276, "grad_norm": 2.359375, "grad_norm_var": 0.0546539306640625, "learning_rate": 0.0001, "loss": 4.3124, "loss/crossentropy": 1.8973188400268555, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24171262234449387, "step": 8638 }, { "epoch": 0.1728, "grad_norm": 2.109375, "grad_norm_var": 0.05347900390625, "learning_rate": 0.0001, "loss": 4.2068, "loss/crossentropy": 1.6730469465255737, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17975886166095734, "step": 8640 }, { "epoch": 0.17284, "grad_norm": 2.265625, "grad_norm_var": 0.04702123006184896, "learning_rate": 0.0001, "loss": 4.4222, "loss/crossentropy": 2.2531689405441284, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2170827016234398, "step": 8642 }, { "epoch": 0.17288, "grad_norm": 2.171875, "grad_norm_var": 0.045873769124348956, "learning_rate": 0.0001, "loss": 4.0249, "loss/crossentropy": 2.0913639068603516, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22586838155984879, "step": 8644 }, { "epoch": 0.17292, "grad_norm": 2.140625, "grad_norm_var": 0.04533869425455729, "learning_rate": 0.0001, "loss": 4.45, "loss/crossentropy": 2.163489580154419, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2290281057357788, "step": 8646 }, { "epoch": 0.17296, "grad_norm": 2.15625, "grad_norm_var": 0.04267552693684896, "learning_rate": 0.0001, "loss": 4.3198, "loss/crossentropy": 2.0669034719467163, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22411519289016724, "step": 8648 }, { "epoch": 0.173, "grad_norm": 2.046875, "grad_norm_var": 0.008213043212890625, "learning_rate": 0.0001, "loss": 4.0474, "loss/crossentropy": 1.9942336678504944, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21262076497077942, "step": 8650 }, { "epoch": 0.17304, "grad_norm": 2.1875, "grad_norm_var": 0.009474436442057291, "learning_rate": 0.0001, "loss": 4.2701, "loss/crossentropy": 2.046514868736267, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2376151606440544, "step": 8652 }, { "epoch": 0.17308, "grad_norm": 2.046875, "grad_norm_var": 0.005995432535807292, "learning_rate": 0.0001, "loss": 4.3308, "loss/crossentropy": 1.8385429382324219, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19642101973295212, "step": 8654 }, { "epoch": 0.17312, "grad_norm": 2.125, "grad_norm_var": 0.007252756754557292, "learning_rate": 0.0001, "loss": 4.446, "loss/crossentropy": 2.259633481502533, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24319174885749817, "step": 8656 }, { "epoch": 0.17316, "grad_norm": 2.15625, "grad_norm_var": 0.005301920572916666, "learning_rate": 0.0001, "loss": 4.2067, "loss/crossentropy": 1.9811018109321594, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21801364421844482, "step": 8658 }, { "epoch": 0.1732, "grad_norm": 2.203125, "grad_norm_var": 0.0059722900390625, "learning_rate": 0.0001, "loss": 4.2158, "loss/crossentropy": 2.1726362705230713, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23193368315696716, "step": 8660 }, { "epoch": 0.17324, "grad_norm": 2.1875, "grad_norm_var": 0.006086222330729167, "learning_rate": 0.0001, "loss": 4.2587, "loss/crossentropy": 1.9915854930877686, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2111910656094551, "step": 8662 }, { "epoch": 0.17328, "grad_norm": 2.671875, "grad_norm_var": 0.026851399739583334, "learning_rate": 0.0001, "loss": 4.5001, "loss/crossentropy": 1.9651137590408325, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.242890365421772, "step": 8664 }, { "epoch": 0.17332, "grad_norm": 2.15625, "grad_norm_var": 0.026008097330729167, "learning_rate": 0.0001, "loss": 4.3848, "loss/crossentropy": 1.865262508392334, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21065659821033478, "step": 8666 }, { "epoch": 0.17336, "grad_norm": 2.0625, "grad_norm_var": 0.02535400390625, "learning_rate": 0.0001, "loss": 4.3402, "loss/crossentropy": 1.9073076248168945, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1959603875875473, "step": 8668 }, { "epoch": 0.1734, "grad_norm": 2.078125, "grad_norm_var": 0.024332682291666668, "learning_rate": 0.0001, "loss": 4.2965, "loss/crossentropy": 2.167983889579773, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22176912426948547, "step": 8670 }, { "epoch": 0.17344, "grad_norm": 2.34375, "grad_norm_var": 0.5460896809895833, "learning_rate": 0.0001, "loss": 4.4551, "loss/crossentropy": 1.7029761672019958, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.222617506980896, "step": 8672 }, { "epoch": 0.17348, "grad_norm": 2.046875, "grad_norm_var": 0.5439849853515625, "learning_rate": 0.0001, "loss": 4.3414, "loss/crossentropy": 2.053748309612274, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23398682475090027, "step": 8674 }, { "epoch": 0.17352, "grad_norm": 2.703125, "grad_norm_var": 0.5408274332682291, "learning_rate": 0.0001, "loss": 4.7882, "loss/crossentropy": 2.309812903404236, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2961876690387726, "step": 8676 }, { "epoch": 0.17356, "grad_norm": 1.9765625, "grad_norm_var": 0.5421953837076823, "learning_rate": 0.0001, "loss": 4.4416, "loss/crossentropy": 2.1045809984207153, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22072184830904007, "step": 8678 }, { "epoch": 0.1736, "grad_norm": 2.015625, "grad_norm_var": 0.5490435282389323, "learning_rate": 0.0001, "loss": 4.4437, "loss/crossentropy": 2.2114070653915405, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2208957076072693, "step": 8680 }, { "epoch": 0.17364, "grad_norm": 2.21875, "grad_norm_var": 0.5490435282389323, "learning_rate": 0.0001, "loss": 4.5931, "loss/crossentropy": 2.1773669719696045, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22639526426792145, "step": 8682 }, { "epoch": 0.17368, "grad_norm": 2.234375, "grad_norm_var": 0.5398272196451823, "learning_rate": 0.0001, "loss": 3.9397, "loss/crossentropy": 1.4213417768478394, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17653951048851013, "step": 8684 }, { "epoch": 0.17372, "grad_norm": 1.9609375, "grad_norm_var": 0.5425374348958333, "learning_rate": 0.0001, "loss": 4.2711, "loss/crossentropy": 1.968630075454712, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2007492408156395, "step": 8686 }, { "epoch": 0.17376, "grad_norm": 2.21875, "grad_norm_var": 0.030304972330729166, "learning_rate": 0.0001, "loss": 4.2297, "loss/crossentropy": 2.0826632976531982, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21895240992307663, "step": 8688 }, { "epoch": 0.1738, "grad_norm": 2.21875, "grad_norm_var": 0.029002888997395834, "learning_rate": 0.0001, "loss": 4.4188, "loss/crossentropy": 2.2756701707839966, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22170037031173706, "step": 8690 }, { "epoch": 0.17384, "grad_norm": 2.0625, "grad_norm_var": 0.010773722330729167, "learning_rate": 0.0001, "loss": 4.51, "loss/crossentropy": 2.329536557197571, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24717354029417038, "step": 8692 }, { "epoch": 0.17388, "grad_norm": 1.9921875, "grad_norm_var": 0.010033162434895833, "learning_rate": 0.0001, "loss": 4.0776, "loss/crossentropy": 2.077241063117981, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.215146966278553, "step": 8694 }, { "epoch": 0.17392, "grad_norm": 2.109375, "grad_norm_var": 0.0091949462890625, "learning_rate": 0.0001, "loss": 4.3482, "loss/crossentropy": 2.2363221645355225, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23428452014923096, "step": 8696 }, { "epoch": 0.17396, "grad_norm": 2.0, "grad_norm_var": 0.01024169921875, "learning_rate": 0.0001, "loss": 4.1321, "loss/crossentropy": 2.055815279483795, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22674524784088135, "step": 8698 }, { "epoch": 0.174, "grad_norm": 2.015625, "grad_norm_var": 0.0098541259765625, "learning_rate": 0.0001, "loss": 4.2789, "loss/crossentropy": 2.205570936203003, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23787499964237213, "step": 8700 }, { "epoch": 0.17404, "grad_norm": 2.0625, "grad_norm_var": 0.009895579020182291, "learning_rate": 0.0001, "loss": 4.4636, "loss/crossentropy": 2.262540578842163, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23921719938516617, "step": 8702 }, { "epoch": 0.17408, "grad_norm": 1.9921875, "grad_norm_var": 0.010114542643229167, "learning_rate": 0.0001, "loss": 4.3372, "loss/crossentropy": 2.5464816093444824, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23302219063043594, "step": 8704 }, { "epoch": 0.17412, "grad_norm": 2.09375, "grad_norm_var": 0.00897216796875, "learning_rate": 0.0001, "loss": 4.2355, "loss/crossentropy": 2.050383508205414, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23393510282039642, "step": 8706 }, { "epoch": 0.17416, "grad_norm": 2.046875, "grad_norm_var": 0.0076812744140625, "learning_rate": 0.0001, "loss": 3.9943, "loss/crossentropy": 1.9034642577171326, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20801686495542526, "step": 8708 }, { "epoch": 0.1742, "grad_norm": 2.09375, "grad_norm_var": 0.007045237223307291, "learning_rate": 0.0001, "loss": 4.2801, "loss/crossentropy": 2.313044309616089, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24881915748119354, "step": 8710 }, { "epoch": 0.17424, "grad_norm": 2.015625, "grad_norm_var": 0.009388987223307292, "learning_rate": 0.0001, "loss": 4.3754, "loss/crossentropy": 1.973829746246338, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21400053054094315, "step": 8712 }, { "epoch": 0.17428, "grad_norm": 2.25, "grad_norm_var": 0.010109202067057291, "learning_rate": 0.0001, "loss": 4.2936, "loss/crossentropy": 1.831783950328827, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20855721831321716, "step": 8714 }, { "epoch": 0.17432, "grad_norm": 2.15625, "grad_norm_var": 0.009683990478515625, "learning_rate": 0.0001, "loss": 4.2681, "loss/crossentropy": 2.0173734426498413, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22365443408489227, "step": 8716 }, { "epoch": 0.17436, "grad_norm": 2.125, "grad_norm_var": 0.007155100504557292, "learning_rate": 0.0001, "loss": 4.4177, "loss/crossentropy": 1.6534234285354614, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20528900623321533, "step": 8718 }, { "epoch": 0.1744, "grad_norm": 2.125, "grad_norm_var": 0.006396484375, "learning_rate": 0.0001, "loss": 4.3658, "loss/crossentropy": 1.8113531470298767, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21433213353157043, "step": 8720 }, { "epoch": 0.17444, "grad_norm": 2.125, "grad_norm_var": 0.0064280192057291664, "learning_rate": 0.0001, "loss": 4.5135, "loss/crossentropy": 2.0750836730003357, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22202756255865097, "step": 8722 }, { "epoch": 0.17448, "grad_norm": 2.046875, "grad_norm_var": 0.0056955973307291664, "learning_rate": 0.0001, "loss": 4.2512, "loss/crossentropy": 2.1388206481933594, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24137140065431595, "step": 8724 }, { "epoch": 0.17452, "grad_norm": 2.0625, "grad_norm_var": 0.006745402018229167, "learning_rate": 0.0001, "loss": 4.0401, "loss/crossentropy": 2.068696141242981, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2215234711766243, "step": 8726 }, { "epoch": 0.17456, "grad_norm": 1.984375, "grad_norm_var": 0.005280558268229167, "learning_rate": 0.0001, "loss": 4.0277, "loss/crossentropy": 1.6970900893211365, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21108710020780563, "step": 8728 }, { "epoch": 0.1746, "grad_norm": 2.15625, "grad_norm_var": 0.005582682291666667, "learning_rate": 0.0001, "loss": 3.8555, "loss/crossentropy": 1.8847576975822449, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2151619866490364, "step": 8730 }, { "epoch": 0.17464, "grad_norm": 2.328125, "grad_norm_var": 0.009403483072916666, "learning_rate": 0.0001, "loss": 4.4088, "loss/crossentropy": 2.4103721380233765, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2589537426829338, "step": 8732 }, { "epoch": 0.17468, "grad_norm": 2.203125, "grad_norm_var": 0.01011962890625, "learning_rate": 0.0001, "loss": 4.1415, "loss/crossentropy": 1.8340824842453003, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22282177209854126, "step": 8734 }, { "epoch": 0.17472, "grad_norm": 2.078125, "grad_norm_var": 0.010302734375, "learning_rate": 0.0001, "loss": 4.2472, "loss/crossentropy": 1.88236665725708, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2193661779165268, "step": 8736 }, { "epoch": 0.17476, "grad_norm": 2.28125, "grad_norm_var": 0.05056050618489583, "learning_rate": 0.0001, "loss": 4.441, "loss/crossentropy": 1.8121293783187866, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21855003386735916, "step": 8738 }, { "epoch": 0.1748, "grad_norm": 2.203125, "grad_norm_var": 0.05090738932291667, "learning_rate": 0.0001, "loss": 4.1422, "loss/crossentropy": 2.215694308280945, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2405528798699379, "step": 8740 }, { "epoch": 0.17484, "grad_norm": 2.0625, "grad_norm_var": 0.05090738932291667, "learning_rate": 0.0001, "loss": 4.2431, "loss/crossentropy": 2.124837279319763, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22565175592899323, "step": 8742 }, { "epoch": 0.17488, "grad_norm": 2.203125, "grad_norm_var": 0.0476959228515625, "learning_rate": 0.0001, "loss": 4.6783, "loss/crossentropy": 2.2531429529190063, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.259593665599823, "step": 8744 }, { "epoch": 0.17492, "grad_norm": 2.046875, "grad_norm_var": 0.04537760416666667, "learning_rate": 0.0001, "loss": 4.3546, "loss/crossentropy": 2.403178572654724, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2475409209728241, "step": 8746 }, { "epoch": 0.17496, "grad_norm": 2.328125, "grad_norm_var": 0.04527587890625, "learning_rate": 0.0001, "loss": 4.5698, "loss/crossentropy": 1.7886858582496643, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21677076816558838, "step": 8748 }, { "epoch": 0.175, "grad_norm": 2.109375, "grad_norm_var": 0.04397379557291667, "learning_rate": 0.0001, "loss": 4.1374, "loss/crossentropy": 1.9257569313049316, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2098483294248581, "step": 8750 }, { "epoch": 0.17504, "grad_norm": 2.078125, "grad_norm_var": 0.04396870930989583, "learning_rate": 0.0001, "loss": 4.407, "loss/crossentropy": 2.1609140634536743, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22846446931362152, "step": 8752 }, { "epoch": 0.17508, "grad_norm": 2.0625, "grad_norm_var": 0.0081939697265625, "learning_rate": 0.0001, "loss": 4.2683, "loss/crossentropy": 2.529700756072998, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2515157088637352, "step": 8754 }, { "epoch": 0.17512, "grad_norm": 1.9921875, "grad_norm_var": 0.008548736572265625, "learning_rate": 0.0001, "loss": 4.4965, "loss/crossentropy": 2.1920565366744995, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24070476740598679, "step": 8756 }, { "epoch": 0.17516, "grad_norm": 1.9453125, "grad_norm_var": 0.0097900390625, "learning_rate": 0.0001, "loss": 3.8416, "loss/crossentropy": 1.7714558839797974, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1787928193807602, "step": 8758 }, { "epoch": 0.1752, "grad_norm": 2.171875, "grad_norm_var": 0.009666951497395833, "learning_rate": 0.0001, "loss": 4.4457, "loss/crossentropy": 1.986818790435791, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20736730098724365, "step": 8760 }, { "epoch": 0.17524, "grad_norm": 2.015625, "grad_norm_var": 0.010453287760416667, "learning_rate": 0.0001, "loss": 4.2731, "loss/crossentropy": 1.8152282238006592, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22459527850151062, "step": 8762 }, { "epoch": 0.17528, "grad_norm": 2.109375, "grad_norm_var": 0.0069163004557291664, "learning_rate": 0.0001, "loss": 4.4058, "loss/crossentropy": 2.2312777042388916, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23017627000808716, "step": 8764 }, { "epoch": 0.17532, "grad_norm": 2.375, "grad_norm_var": 0.013252766927083333, "learning_rate": 0.0001, "loss": 4.4714, "loss/crossentropy": 2.107849955558777, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21224269270896912, "step": 8766 }, { "epoch": 0.17536, "grad_norm": 2.15625, "grad_norm_var": 0.013206990559895833, "learning_rate": 0.0001, "loss": 4.1641, "loss/crossentropy": 2.1588711738586426, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2200954109430313, "step": 8768 }, { "epoch": 0.1754, "grad_norm": 2.1875, "grad_norm_var": 0.0130126953125, "learning_rate": 0.0001, "loss": 4.4056, "loss/crossentropy": 2.1355313062667847, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24048195779323578, "step": 8770 }, { "epoch": 0.17544, "grad_norm": 2.125, "grad_norm_var": 0.011557769775390626, "learning_rate": 0.0001, "loss": 4.3505, "loss/crossentropy": 2.477591037750244, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24459562450647354, "step": 8772 }, { "epoch": 0.17548, "grad_norm": 2.046875, "grad_norm_var": 0.00845947265625, "learning_rate": 0.0001, "loss": 4.4184, "loss/crossentropy": 2.2577285766601562, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23038798570632935, "step": 8774 }, { "epoch": 0.17552, "grad_norm": 2.15625, "grad_norm_var": 0.009813435872395833, "learning_rate": 0.0001, "loss": 4.2681, "loss/crossentropy": 2.239536762237549, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22290733456611633, "step": 8776 }, { "epoch": 0.17556, "grad_norm": 2.125, "grad_norm_var": 0.007373046875, "learning_rate": 0.0001, "loss": 4.4351, "loss/crossentropy": 1.9139958024024963, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2175864800810814, "step": 8778 }, { "epoch": 0.1756, "grad_norm": 2.0625, "grad_norm_var": 0.00888671875, "learning_rate": 0.0001, "loss": 4.0756, "loss/crossentropy": 2.0622661113739014, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2223067432641983, "step": 8780 }, { "epoch": 0.17564, "grad_norm": 2.15625, "grad_norm_var": 0.004524739583333334, "learning_rate": 0.0001, "loss": 4.4552, "loss/crossentropy": 2.222475051879883, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21181221306324005, "step": 8782 }, { "epoch": 0.17568, "grad_norm": 1.9296875, "grad_norm_var": 0.007002512613932292, "learning_rate": 0.0001, "loss": 4.1696, "loss/crossentropy": 2.2612074613571167, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24079158157110214, "step": 8784 }, { "epoch": 0.17572, "grad_norm": 2.078125, "grad_norm_var": 0.005863189697265625, "learning_rate": 0.0001, "loss": 4.0889, "loss/crossentropy": 2.1629387736320496, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21437199413776398, "step": 8786 }, { "epoch": 0.17576, "grad_norm": 2.328125, "grad_norm_var": 0.009492746988932292, "learning_rate": 0.0001, "loss": 4.6583, "loss/crossentropy": 2.145151972770691, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23797442018985748, "step": 8788 }, { "epoch": 0.1758, "grad_norm": 2.453125, "grad_norm_var": 0.015457916259765624, "learning_rate": 0.0001, "loss": 4.5188, "loss/crossentropy": 2.0366984605789185, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2191104218363762, "step": 8790 }, { "epoch": 0.17584, "grad_norm": 2.203125, "grad_norm_var": 0.014422353108723958, "learning_rate": 0.0001, "loss": 4.1604, "loss/crossentropy": 2.1049715280532837, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.230033777654171, "step": 8792 }, { "epoch": 0.17588, "grad_norm": 2.15625, "grad_norm_var": 0.014338938395182292, "learning_rate": 0.0001, "loss": 4.4327, "loss/crossentropy": 2.2549991607666016, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23084092140197754, "step": 8794 }, { "epoch": 0.17592, "grad_norm": 2.125, "grad_norm_var": 0.015547688802083333, "learning_rate": 0.0001, "loss": 4.4653, "loss/crossentropy": 1.9873813390731812, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19879335910081863, "step": 8796 }, { "epoch": 0.17596, "grad_norm": 2.140625, "grad_norm_var": 0.015677897135416667, "learning_rate": 0.0001, "loss": 4.6371, "loss/crossentropy": 2.0723283886909485, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20538055896759033, "step": 8798 }, { "epoch": 0.176, "grad_norm": 2.125, "grad_norm_var": 0.013099924723307291, "learning_rate": 0.0001, "loss": 4.0313, "loss/crossentropy": 2.090642750263214, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2329270914196968, "step": 8800 }, { "epoch": 0.17604, "grad_norm": 2.1875, "grad_norm_var": 0.012672678629557291, "learning_rate": 0.0001, "loss": 4.4626, "loss/crossentropy": 2.3432271480560303, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2456662431359291, "step": 8802 }, { "epoch": 0.17608, "grad_norm": 2.21875, "grad_norm_var": 0.010465240478515625, "learning_rate": 0.0001, "loss": 4.5133, "loss/crossentropy": 2.1210837364196777, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22580894827842712, "step": 8804 }, { "epoch": 0.17612, "grad_norm": 2.0, "grad_norm_var": 0.005147043863932292, "learning_rate": 0.0001, "loss": 4.024, "loss/crossentropy": 2.142494797706604, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22426588833332062, "step": 8806 }, { "epoch": 0.17616, "grad_norm": 2.0625, "grad_norm_var": 0.0049435933430989586, "learning_rate": 0.0001, "loss": 4.2877, "loss/crossentropy": 1.9163227677345276, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21529845893383026, "step": 8808 }, { "epoch": 0.1762, "grad_norm": 2.140625, "grad_norm_var": 0.004937489827473958, "learning_rate": 0.0001, "loss": 4.4776, "loss/crossentropy": 2.1478612422943115, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2354428842663765, "step": 8810 }, { "epoch": 0.17624, "grad_norm": 2.15625, "grad_norm_var": 0.0033854166666666668, "learning_rate": 0.0001, "loss": 4.435, "loss/crossentropy": 2.1546601057052612, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22553270310163498, "step": 8812 }, { "epoch": 0.17628, "grad_norm": 2.140625, "grad_norm_var": 0.004233551025390625, "learning_rate": 0.0001, "loss": 4.1661, "loss/crossentropy": 2.0559862852096558, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19714127480983734, "step": 8814 }, { "epoch": 0.17632, "grad_norm": 2.140625, "grad_norm_var": 0.004078928629557292, "learning_rate": 0.0001, "loss": 4.3543, "loss/crossentropy": 2.1340363025665283, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22422882914543152, "step": 8816 }, { "epoch": 0.17636, "grad_norm": 2.078125, "grad_norm_var": 0.0038937886555989584, "learning_rate": 0.0001, "loss": 4.4464, "loss/crossentropy": 2.265942335128784, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23249086737632751, "step": 8818 }, { "epoch": 0.1764, "grad_norm": 2.0625, "grad_norm_var": 0.0031939188639322916, "learning_rate": 0.0001, "loss": 4.3187, "loss/crossentropy": 2.245513081550598, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22869569063186646, "step": 8820 }, { "epoch": 0.17644, "grad_norm": 2.0, "grad_norm_var": 0.006461334228515625, "learning_rate": 0.0001, "loss": 4.179, "loss/crossentropy": 1.851025104522705, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21989689767360687, "step": 8822 }, { "epoch": 0.17648, "grad_norm": 2.09375, "grad_norm_var": 0.006266021728515625, "learning_rate": 0.0001, "loss": 4.276, "loss/crossentropy": 2.2972241640090942, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22793132066726685, "step": 8824 }, { "epoch": 0.17652, "grad_norm": 2.078125, "grad_norm_var": 0.006276194254557292, "learning_rate": 0.0001, "loss": 4.3955, "loss/crossentropy": 2.248735189437866, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2133597657084465, "step": 8826 }, { "epoch": 0.17656, "grad_norm": 2.15625, "grad_norm_var": 0.008314768473307291, "learning_rate": 0.0001, "loss": 4.4423, "loss/crossentropy": 2.4173099994659424, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23985996842384338, "step": 8828 }, { "epoch": 0.1766, "grad_norm": 2.28125, "grad_norm_var": 0.008698527018229167, "learning_rate": 0.0001, "loss": 4.5425, "loss/crossentropy": 2.5017653703689575, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26119648665189743, "step": 8830 }, { "epoch": 0.17664, "grad_norm": 2.09375, "grad_norm_var": 0.0090484619140625, "learning_rate": 0.0001, "loss": 4.2474, "loss/crossentropy": 1.9006813764572144, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19938994944095612, "step": 8832 }, { "epoch": 0.17668, "grad_norm": 2.046875, "grad_norm_var": 0.009496053059895834, "learning_rate": 0.0001, "loss": 4.1643, "loss/crossentropy": 2.101746916770935, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20544035732746124, "step": 8834 }, { "epoch": 0.17672, "grad_norm": 2.15625, "grad_norm_var": 0.010542805989583333, "learning_rate": 0.0001, "loss": 4.3211, "loss/crossentropy": 2.1605160236358643, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23061934113502502, "step": 8836 }, { "epoch": 0.17676, "grad_norm": 2.296875, "grad_norm_var": 0.008967081705729166, "learning_rate": 0.0001, "loss": 4.4357, "loss/crossentropy": 1.963772177696228, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20736530423164368, "step": 8838 }, { "epoch": 0.1768, "grad_norm": 2.1875, "grad_norm_var": 0.009733072916666667, "learning_rate": 0.0001, "loss": 4.3572, "loss/crossentropy": 2.154300093650818, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22221273183822632, "step": 8840 }, { "epoch": 0.17684, "grad_norm": 2.109375, "grad_norm_var": 0.009501139322916666, "learning_rate": 0.0001, "loss": 4.4876, "loss/crossentropy": 2.1576497554779053, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21834726631641388, "step": 8842 }, { "epoch": 0.17688, "grad_norm": 1.9375, "grad_norm_var": 0.0115875244140625, "learning_rate": 0.0001, "loss": 4.1192, "loss/crossentropy": 2.1316112279891968, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2166184037923813, "step": 8844 }, { "epoch": 0.17692, "grad_norm": 2.15625, "grad_norm_var": 0.0098052978515625, "learning_rate": 0.0001, "loss": 4.2421, "loss/crossentropy": 2.068525493144989, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23183748871088028, "step": 8846 }, { "epoch": 0.17696, "grad_norm": 2.203125, "grad_norm_var": 0.01051025390625, "learning_rate": 0.0001, "loss": 4.5128, "loss/crossentropy": 2.185767650604248, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25881427526474, "step": 8848 }, { "epoch": 0.177, "grad_norm": 2.125, "grad_norm_var": 0.010619099934895833, "learning_rate": 0.0001, "loss": 4.3073, "loss/crossentropy": 1.979454517364502, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20669714361429214, "step": 8850 }, { "epoch": 0.17704, "grad_norm": 2.109375, "grad_norm_var": 0.009373982747395834, "learning_rate": 0.0001, "loss": 4.3356, "loss/crossentropy": 2.3473092317581177, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23232270777225494, "step": 8852 }, { "epoch": 0.17708, "grad_norm": 2.046875, "grad_norm_var": 0.007616170247395833, "learning_rate": 0.0001, "loss": 4.3732, "loss/crossentropy": 2.461324691772461, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.242417573928833, "step": 8854 }, { "epoch": 0.17712, "grad_norm": 2.125, "grad_norm_var": 0.008385976155598959, "learning_rate": 0.0001, "loss": 4.1107, "loss/crossentropy": 1.5953214168548584, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17768454551696777, "step": 8856 }, { "epoch": 0.17716, "grad_norm": 2.15625, "grad_norm_var": 0.008377838134765624, "learning_rate": 0.0001, "loss": 4.4289, "loss/crossentropy": 2.1969146728515625, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21466109156608582, "step": 8858 }, { "epoch": 0.1772, "grad_norm": 1.921875, "grad_norm_var": 0.008459218343098958, "learning_rate": 0.0001, "loss": 4.2154, "loss/crossentropy": 2.243234634399414, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22729554027318954, "step": 8860 }, { "epoch": 0.17724, "grad_norm": 2.296875, "grad_norm_var": 0.010628000895182291, "learning_rate": 0.0001, "loss": 4.2062, "loss/crossentropy": 2.1855397820472717, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24905066192150116, "step": 8862 }, { "epoch": 0.17728, "grad_norm": 2.203125, "grad_norm_var": 0.010628000895182291, "learning_rate": 0.0001, "loss": 4.4075, "loss/crossentropy": 2.320886254310608, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22135943174362183, "step": 8864 }, { "epoch": 0.17732, "grad_norm": 1.96875, "grad_norm_var": 0.011736806233723958, "learning_rate": 0.0001, "loss": 4.1444, "loss/crossentropy": 2.140891909599304, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2109990492463112, "step": 8866 }, { "epoch": 0.17736, "grad_norm": 1.921875, "grad_norm_var": 0.013038889567057291, "learning_rate": 0.0001, "loss": 4.1009, "loss/crossentropy": 2.147824764251709, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2131204530596733, "step": 8868 }, { "epoch": 0.1774, "grad_norm": 2.71875, "grad_norm_var": 0.039249420166015625, "learning_rate": 0.0001, "loss": 4.7136, "loss/crossentropy": 2.187807321548462, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2335718423128128, "step": 8870 }, { "epoch": 0.17744, "grad_norm": 2.1875, "grad_norm_var": 0.037287394205729164, "learning_rate": 0.0001, "loss": 4.5265, "loss/crossentropy": 2.3127458095550537, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2385788857936859, "step": 8872 }, { "epoch": 0.17748, "grad_norm": 2.046875, "grad_norm_var": 0.0369537353515625, "learning_rate": 0.0001, "loss": 4.1364, "loss/crossentropy": 1.859586775302887, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.204575777053833, "step": 8874 }, { "epoch": 0.17752, "grad_norm": 2.0, "grad_norm_var": 0.03585611979166667, "learning_rate": 0.0001, "loss": 4.1993, "loss/crossentropy": 1.9626107215881348, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21016598492860794, "step": 8876 }, { "epoch": 0.17756, "grad_norm": 2.046875, "grad_norm_var": 0.034077962239583336, "learning_rate": 0.0001, "loss": 4.462, "loss/crossentropy": 2.1130539774894714, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21995095163583755, "step": 8878 }, { "epoch": 0.1776, "grad_norm": 2.203125, "grad_norm_var": 0.035374959309895836, "learning_rate": 0.0001, "loss": 4.4677, "loss/crossentropy": 1.8914743065834045, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20437531173229218, "step": 8880 }, { "epoch": 0.17764, "grad_norm": 2.125, "grad_norm_var": 0.032763671875, "learning_rate": 0.0001, "loss": 4.4975, "loss/crossentropy": 2.2135708332061768, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2478664517402649, "step": 8882 }, { "epoch": 0.17768, "grad_norm": 2.203125, "grad_norm_var": 0.028922526041666667, "learning_rate": 0.0001, "loss": 4.3763, "loss/crossentropy": 2.194110333919525, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2366471290588379, "step": 8884 }, { "epoch": 0.17772, "grad_norm": 1.9609375, "grad_norm_var": 0.009822336832682292, "learning_rate": 0.0001, "loss": 4.2844, "loss/crossentropy": 2.4365748167037964, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25842973589897156, "step": 8886 }, { "epoch": 0.17776, "grad_norm": 2.078125, "grad_norm_var": 0.010001373291015626, "learning_rate": 0.0001, "loss": 4.0574, "loss/crossentropy": 1.9800177216529846, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21982619166374207, "step": 8888 }, { "epoch": 0.1778, "grad_norm": 2.15625, "grad_norm_var": 0.009956614176432291, "learning_rate": 0.0001, "loss": 4.3465, "loss/crossentropy": 2.1437748670578003, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21494000405073166, "step": 8890 }, { "epoch": 0.17784, "grad_norm": 2.1875, "grad_norm_var": 0.008957672119140624, "learning_rate": 0.0001, "loss": 4.4051, "loss/crossentropy": 2.0610267519950867, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22473593801259995, "step": 8892 }, { "epoch": 0.17788, "grad_norm": 1.953125, "grad_norm_var": 0.013952382405598958, "learning_rate": 0.0001, "loss": 4.4637, "loss/crossentropy": 2.261958599090576, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2250354364514351, "step": 8894 }, { "epoch": 0.17792, "grad_norm": 2.109375, "grad_norm_var": 0.012237294514973959, "learning_rate": 0.0001, "loss": 4.3508, "loss/crossentropy": 2.3689773082733154, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22185539454221725, "step": 8896 }, { "epoch": 0.17796, "grad_norm": 2.15625, "grad_norm_var": 0.013034820556640625, "learning_rate": 0.0001, "loss": 4.7432, "loss/crossentropy": 2.612341523170471, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2561237961053848, "step": 8898 }, { "epoch": 0.178, "grad_norm": 2.0625, "grad_norm_var": 0.010990142822265625, "learning_rate": 0.0001, "loss": 4.5876, "loss/crossentropy": 2.1230576038360596, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23115848749876022, "step": 8900 }, { "epoch": 0.17804, "grad_norm": 2.109375, "grad_norm_var": 0.009626261393229167, "learning_rate": 0.0001, "loss": 4.2154, "loss/crossentropy": 2.200004458427429, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22002413868904114, "step": 8902 }, { "epoch": 0.17808, "grad_norm": 2.03125, "grad_norm_var": 0.009989420572916666, "learning_rate": 0.0001, "loss": 4.3438, "loss/crossentropy": 2.323713779449463, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22923698276281357, "step": 8904 }, { "epoch": 0.17812, "grad_norm": 2.140625, "grad_norm_var": 0.00982666015625, "learning_rate": 0.0001, "loss": 4.3079, "loss/crossentropy": 2.038426458835602, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21465667337179184, "step": 8906 }, { "epoch": 0.17816, "grad_norm": 2.109375, "grad_norm_var": 0.01217041015625, "learning_rate": 0.0001, "loss": 4.4498, "loss/crossentropy": 2.3639097213745117, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2378860041499138, "step": 8908 }, { "epoch": 0.1782, "grad_norm": 2.046875, "grad_norm_var": 0.00699462890625, "learning_rate": 0.0001, "loss": 4.261, "loss/crossentropy": 1.8291080594062805, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19894887506961823, "step": 8910 }, { "epoch": 0.17824, "grad_norm": 2.265625, "grad_norm_var": 0.011263020833333333, "learning_rate": 0.0001, "loss": 4.5606, "loss/crossentropy": 2.3113714456558228, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24526391178369522, "step": 8912 }, { "epoch": 0.17828, "grad_norm": 2.109375, "grad_norm_var": 0.011449178059895834, "learning_rate": 0.0001, "loss": 4.3016, "loss/crossentropy": 2.114617943763733, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21305133402347565, "step": 8914 }, { "epoch": 0.17832, "grad_norm": 2.15625, "grad_norm_var": 0.014975738525390626, "learning_rate": 0.0001, "loss": 3.9333, "loss/crossentropy": 1.6893808841705322, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19195494800806046, "step": 8916 }, { "epoch": 0.17836, "grad_norm": 2.234375, "grad_norm_var": 0.016721343994140624, "learning_rate": 0.0001, "loss": 4.2501, "loss/crossentropy": 1.829396367073059, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21181780844926834, "step": 8918 }, { "epoch": 0.1784, "grad_norm": 2.03125, "grad_norm_var": 0.01869481404622396, "learning_rate": 0.0001, "loss": 4.1542, "loss/crossentropy": 1.8910154104232788, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20975399017333984, "step": 8920 }, { "epoch": 0.17844, "grad_norm": 2.140625, "grad_norm_var": 0.01953709920247396, "learning_rate": 0.0001, "loss": 4.3327, "loss/crossentropy": 2.0501255989074707, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22336408495903015, "step": 8922 }, { "epoch": 0.17848, "grad_norm": 2.109375, "grad_norm_var": 0.01740086873372396, "learning_rate": 0.0001, "loss": 4.3035, "loss/crossentropy": 2.3023892641067505, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23251917958259583, "step": 8924 }, { "epoch": 0.17852, "grad_norm": 2.015625, "grad_norm_var": 0.01822077433268229, "learning_rate": 0.0001, "loss": 4.0592, "loss/crossentropy": 2.0030421018600464, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19997069239616394, "step": 8926 }, { "epoch": 0.17856, "grad_norm": 1.9921875, "grad_norm_var": 0.013741048177083333, "learning_rate": 0.0001, "loss": 4.2568, "loss/crossentropy": 2.2309017181396484, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22145257890224457, "step": 8928 }, { "epoch": 0.1786, "grad_norm": 2.171875, "grad_norm_var": 0.013728841145833334, "learning_rate": 0.0001, "loss": 4.4366, "loss/crossentropy": 2.1135157346725464, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21580957621335983, "step": 8930 }, { "epoch": 0.17864, "grad_norm": 2.015625, "grad_norm_var": 0.011146799723307291, "learning_rate": 0.0001, "loss": 4.3473, "loss/crossentropy": 2.098900556564331, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22469021379947662, "step": 8932 }, { "epoch": 0.17868, "grad_norm": 2.21875, "grad_norm_var": 0.0103179931640625, "learning_rate": 0.0001, "loss": 4.2702, "loss/crossentropy": 2.1558337211608887, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24765773862600327, "step": 8934 }, { "epoch": 0.17872, "grad_norm": 2.09375, "grad_norm_var": 0.006004842122395834, "learning_rate": 0.0001, "loss": 4.3318, "loss/crossentropy": 2.141040623188019, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22490675747394562, "step": 8936 }, { "epoch": 0.17876, "grad_norm": 2.171875, "grad_norm_var": 0.0096099853515625, "learning_rate": 0.0001, "loss": 4.4126, "loss/crossentropy": 2.1036806106567383, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23260055482387543, "step": 8938 }, { "epoch": 0.1788, "grad_norm": 1.984375, "grad_norm_var": 0.013036092122395834, "learning_rate": 0.0001, "loss": 4.4151, "loss/crossentropy": 1.9403663277626038, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21941428631544113, "step": 8940 }, { "epoch": 0.17884, "grad_norm": 2.03125, "grad_norm_var": 0.014717610677083333, "learning_rate": 0.0001, "loss": 4.2, "loss/crossentropy": 1.8589079976081848, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21533852070569992, "step": 8942 }, { "epoch": 0.17888, "grad_norm": 2.21875, "grad_norm_var": 0.013816070556640626, "learning_rate": 0.0001, "loss": 4.2476, "loss/crossentropy": 2.3136903643608093, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23139026761054993, "step": 8944 }, { "epoch": 0.17892, "grad_norm": 2.109375, "grad_norm_var": 0.012835439046223958, "learning_rate": 0.0001, "loss": 4.2669, "loss/crossentropy": 2.305663585662842, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24263737350702286, "step": 8946 }, { "epoch": 0.17896, "grad_norm": 2.5625, "grad_norm_var": 0.02195002237955729, "learning_rate": 0.0001, "loss": 4.3059, "loss/crossentropy": 2.0450612902641296, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2124529778957367, "step": 8948 }, { "epoch": 0.179, "grad_norm": 2.3125, "grad_norm_var": 0.020393880208333333, "learning_rate": 0.0001, "loss": 4.4229, "loss/crossentropy": 2.435065984725952, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24603386223316193, "step": 8950 }, { "epoch": 0.17904, "grad_norm": 2.234375, "grad_norm_var": 0.02295099894205729, "learning_rate": 0.0001, "loss": 4.2604, "loss/crossentropy": 2.092818021774292, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20905248820781708, "step": 8952 }, { "epoch": 0.17908, "grad_norm": 2.109375, "grad_norm_var": 0.024621327718098957, "learning_rate": 0.0001, "loss": 4.2539, "loss/crossentropy": 2.08588969707489, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2142588049173355, "step": 8954 }, { "epoch": 0.17912, "grad_norm": 2.25, "grad_norm_var": 0.021144358317057292, "learning_rate": 0.0001, "loss": 4.3958, "loss/crossentropy": 1.9161878824234009, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20626582205295563, "step": 8956 }, { "epoch": 0.17916, "grad_norm": 2.078125, "grad_norm_var": 0.020182037353515626, "learning_rate": 0.0001, "loss": 4.3726, "loss/crossentropy": 2.3072937726974487, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22028075903654099, "step": 8958 }, { "epoch": 0.1792, "grad_norm": 2.15625, "grad_norm_var": 0.021109771728515626, "learning_rate": 0.0001, "loss": 4.4876, "loss/crossentropy": 2.3053938150405884, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22212930023670197, "step": 8960 }, { "epoch": 0.17924, "grad_norm": 2.0625, "grad_norm_var": 0.02269261678059896, "learning_rate": 0.0001, "loss": 4.5137, "loss/crossentropy": 1.9130414128303528, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20036083459854126, "step": 8962 }, { "epoch": 0.17928, "grad_norm": 2.0, "grad_norm_var": 0.013242340087890625, "learning_rate": 0.0001, "loss": 4.1299, "loss/crossentropy": 2.3808066844940186, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2116570845246315, "step": 8964 }, { "epoch": 0.17932, "grad_norm": 2.125, "grad_norm_var": 0.009549713134765625, "learning_rate": 0.0001, "loss": 4.296, "loss/crossentropy": 2.180716395378113, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2275484874844551, "step": 8966 }, { "epoch": 0.17936, "grad_norm": 1.984375, "grad_norm_var": 0.008837890625, "learning_rate": 0.0001, "loss": 4.2597, "loss/crossentropy": 2.027850866317749, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2011229619383812, "step": 8968 }, { "epoch": 0.1794, "grad_norm": 2.171875, "grad_norm_var": 0.017447916666666667, "learning_rate": 0.0001, "loss": 4.3167, "loss/crossentropy": 2.077622890472412, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21361806988716125, "step": 8970 }, { "epoch": 0.17944, "grad_norm": 2.03125, "grad_norm_var": 0.017513020833333334, "learning_rate": 0.0001, "loss": 4.1972, "loss/crossentropy": 2.004905104637146, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21110886335372925, "step": 8972 }, { "epoch": 0.17948, "grad_norm": 2.21875, "grad_norm_var": 0.019466145833333334, "learning_rate": 0.0001, "loss": 4.5026, "loss/crossentropy": 2.1859925389289856, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23011507838964462, "step": 8974 }, { "epoch": 0.17952, "grad_norm": 2.21875, "grad_norm_var": 0.019840494791666666, "learning_rate": 0.0001, "loss": 4.2908, "loss/crossentropy": 1.8372295498847961, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2171105071902275, "step": 8976 }, { "epoch": 0.17956, "grad_norm": 2.328125, "grad_norm_var": 0.020197550455729168, "learning_rate": 0.0001, "loss": 4.3887, "loss/crossentropy": 2.127313494682312, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22017831355333328, "step": 8978 }, { "epoch": 0.1796, "grad_norm": 2.421875, "grad_norm_var": 0.022098795572916666, "learning_rate": 0.0001, "loss": 4.3389, "loss/crossentropy": 2.121580421924591, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24898843467235565, "step": 8980 }, { "epoch": 0.17964, "grad_norm": 2.109375, "grad_norm_var": 0.0226470947265625, "learning_rate": 0.0001, "loss": 4.1127, "loss/crossentropy": 2.0973563194274902, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21355029940605164, "step": 8982 }, { "epoch": 0.17968, "grad_norm": 2.125, "grad_norm_var": 0.018896484375, "learning_rate": 0.0001, "loss": 4.3898, "loss/crossentropy": 2.1109927892684937, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2096879929304123, "step": 8984 }, { "epoch": 0.17972, "grad_norm": 2.078125, "grad_norm_var": 0.0126617431640625, "learning_rate": 0.0001, "loss": 4.1264, "loss/crossentropy": 1.954129159450531, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20745816081762314, "step": 8986 }, { "epoch": 0.17976, "grad_norm": 2.390625, "grad_norm_var": 0.013483683268229166, "learning_rate": 0.0001, "loss": 4.5357, "loss/crossentropy": 1.875806748867035, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19150983542203903, "step": 8988 }, { "epoch": 0.1798, "grad_norm": 2.109375, "grad_norm_var": 0.01451416015625, "learning_rate": 0.0001, "loss": 4.3042, "loss/crossentropy": 2.3101617097854614, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2393202781677246, "step": 8990 }, { "epoch": 0.17984, "grad_norm": 2.234375, "grad_norm_var": 0.013374837239583333, "learning_rate": 0.0001, "loss": 4.4954, "loss/crossentropy": 2.3156551122665405, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23346291482448578, "step": 8992 }, { "epoch": 0.17988, "grad_norm": 2.1875, "grad_norm_var": 0.013966623942057292, "learning_rate": 0.0001, "loss": 4.3675, "loss/crossentropy": 2.4437999725341797, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2172817811369896, "step": 8994 }, { "epoch": 0.17992, "grad_norm": 2.046875, "grad_norm_var": 0.009437815348307291, "learning_rate": 0.0001, "loss": 4.3122, "loss/crossentropy": 2.0451250076293945, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22556670010089874, "step": 8996 }, { "epoch": 0.17996, "grad_norm": 2.21875, "grad_norm_var": 0.009852854410807292, "learning_rate": 0.0001, "loss": 4.5654, "loss/crossentropy": 2.3135393857955933, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2313593551516533, "step": 8998 }, { "epoch": 0.18, "grad_norm": 2.125, "grad_norm_var": 0.010530344645182292, "learning_rate": 0.0001, "loss": 4.3001, "loss/crossentropy": 1.9934805035591125, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20815817266702652, "step": 9000 }, { "epoch": 0.18004, "grad_norm": 2.125, "grad_norm_var": 0.011982981363932292, "learning_rate": 0.0001, "loss": 4.6193, "loss/crossentropy": 2.1770662665367126, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23489046096801758, "step": 9002 }, { "epoch": 0.18008, "grad_norm": 2.3125, "grad_norm_var": 0.009779612223307291, "learning_rate": 0.0001, "loss": 4.3708, "loss/crossentropy": 1.9791364073753357, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2139936238527298, "step": 9004 }, { "epoch": 0.18012, "grad_norm": 2.015625, "grad_norm_var": 0.011445871988932292, "learning_rate": 0.0001, "loss": 4.298, "loss/crossentropy": 2.2092931270599365, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22620443254709244, "step": 9006 }, { "epoch": 0.18016, "grad_norm": 2.171875, "grad_norm_var": 0.011034901936848958, "learning_rate": 0.0001, "loss": 4.3114, "loss/crossentropy": 2.123443365097046, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21943332999944687, "step": 9008 }, { "epoch": 0.1802, "grad_norm": 2.09375, "grad_norm_var": 0.009212239583333334, "learning_rate": 0.0001, "loss": 4.4666, "loss/crossentropy": 2.243329405784607, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24108020961284637, "step": 9010 }, { "epoch": 0.18024, "grad_norm": 2.1875, "grad_norm_var": 0.008426920572916666, "learning_rate": 0.0001, "loss": 4.3687, "loss/crossentropy": 2.366227388381958, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21806316077709198, "step": 9012 }, { "epoch": 0.18028, "grad_norm": 2.03125, "grad_norm_var": 0.009663899739583334, "learning_rate": 0.0001, "loss": 3.8255, "loss/crossentropy": 1.768812358379364, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1861925944685936, "step": 9014 }, { "epoch": 0.18032, "grad_norm": 1.9296875, "grad_norm_var": 0.012225087483723958, "learning_rate": 0.0001, "loss": 4.1236, "loss/crossentropy": 1.9376537799835205, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18691913783550262, "step": 9016 }, { "epoch": 0.18036, "grad_norm": 2.140625, "grad_norm_var": 0.010709381103515625, "learning_rate": 0.0001, "loss": 4.2748, "loss/crossentropy": 2.3026299476623535, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23757526278495789, "step": 9018 }, { "epoch": 0.1804, "grad_norm": 1.9453125, "grad_norm_var": 0.010057576497395833, "learning_rate": 0.0001, "loss": 4.0026, "loss/crossentropy": 1.9697216153144836, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21453910320997238, "step": 9020 }, { "epoch": 0.18044, "grad_norm": 2.0625, "grad_norm_var": 0.011058553059895834, "learning_rate": 0.0001, "loss": 4.5457, "loss/crossentropy": 2.257638931274414, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24236120283603668, "step": 9022 }, { "epoch": 0.18048, "grad_norm": 2.0625, "grad_norm_var": 0.0115142822265625, "learning_rate": 0.0001, "loss": 4.3323, "loss/crossentropy": 2.244320869445801, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25429578125476837, "step": 9024 }, { "epoch": 0.18052, "grad_norm": 2.21875, "grad_norm_var": 0.012214152018229167, "learning_rate": 0.0001, "loss": 4.2854, "loss/crossentropy": 2.1105872988700867, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23081901669502258, "step": 9026 }, { "epoch": 0.18056, "grad_norm": 1.9921875, "grad_norm_var": 0.012839508056640626, "learning_rate": 0.0001, "loss": 4.2561, "loss/crossentropy": 1.9657647609710693, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22111424803733826, "step": 9028 }, { "epoch": 0.1806, "grad_norm": 2.125, "grad_norm_var": 0.012308502197265625, "learning_rate": 0.0001, "loss": 4.4019, "loss/crossentropy": 2.0759438276290894, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22763221710920334, "step": 9030 }, { "epoch": 0.18064, "grad_norm": 2.125, "grad_norm_var": 0.010888417561848959, "learning_rate": 0.0001, "loss": 3.9999, "loss/crossentropy": 2.0250572562217712, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2279352843761444, "step": 9032 }, { "epoch": 0.18068, "grad_norm": 2.09375, "grad_norm_var": 0.010534413655598958, "learning_rate": 0.0001, "loss": 4.3772, "loss/crossentropy": 2.270031213760376, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23242096602916718, "step": 9034 }, { "epoch": 0.18072, "grad_norm": 2.25, "grad_norm_var": 0.011922200520833334, "learning_rate": 0.0001, "loss": 4.4304, "loss/crossentropy": 2.10041344165802, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23192601650953293, "step": 9036 }, { "epoch": 0.18076, "grad_norm": 2.15625, "grad_norm_var": 0.009227498372395834, "learning_rate": 0.0001, "loss": 4.1041, "loss/crossentropy": 2.0255953073501587, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23707614094018936, "step": 9038 }, { "epoch": 0.1808, "grad_norm": 1.9453125, "grad_norm_var": 0.010406239827473959, "learning_rate": 0.0001, "loss": 4.1399, "loss/crossentropy": 1.8162729740142822, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19466694444417953, "step": 9040 }, { "epoch": 0.18084, "grad_norm": 2.203125, "grad_norm_var": 0.010170237223307291, "learning_rate": 0.0001, "loss": 4.5543, "loss/crossentropy": 2.1271599531173706, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22750889509916306, "step": 9042 }, { "epoch": 0.18088, "grad_norm": 2.171875, "grad_norm_var": 0.009447224934895833, "learning_rate": 0.0001, "loss": 4.5377, "loss/crossentropy": 2.3638603687286377, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2456393539905548, "step": 9044 }, { "epoch": 0.18092, "grad_norm": 2.125, "grad_norm_var": 0.019437662760416665, "learning_rate": 0.0001, "loss": 4.3413, "loss/crossentropy": 1.5851669907569885, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20994101464748383, "step": 9046 }, { "epoch": 0.18096, "grad_norm": 2.1875, "grad_norm_var": 0.017789459228515624, "learning_rate": 0.0001, "loss": 4.2284, "loss/crossentropy": 1.7990906834602356, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18919725716114044, "step": 9048 }, { "epoch": 0.181, "grad_norm": 2.28125, "grad_norm_var": 0.017490386962890625, "learning_rate": 0.0001, "loss": 4.5415, "loss/crossentropy": 2.0975595712661743, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23837832361459732, "step": 9050 }, { "epoch": 0.18104, "grad_norm": 2.0625, "grad_norm_var": 0.018070475260416666, "learning_rate": 0.0001, "loss": 4.1304, "loss/crossentropy": 2.1970856189727783, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23219536244869232, "step": 9052 }, { "epoch": 0.18108, "grad_norm": 2.15625, "grad_norm_var": 0.017292277018229166, "learning_rate": 0.0001, "loss": 4.211, "loss/crossentropy": 2.0846009850502014, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20775136351585388, "step": 9054 }, { "epoch": 0.18112, "grad_norm": 2.0625, "grad_norm_var": 0.015075429280598959, "learning_rate": 0.0001, "loss": 4.0415, "loss/crossentropy": 1.662496030330658, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18223516643047333, "step": 9056 }, { "epoch": 0.18116, "grad_norm": 2.046875, "grad_norm_var": 0.016078440348307292, "learning_rate": 0.0001, "loss": 4.2561, "loss/crossentropy": 2.126902401447296, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2193778082728386, "step": 9058 }, { "epoch": 0.1812, "grad_norm": 2.984375, "grad_norm_var": 0.062459309895833336, "learning_rate": 0.0001, "loss": 4.0343, "loss/crossentropy": 1.9529814720153809, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19026879966259003, "step": 9060 }, { "epoch": 0.18124, "grad_norm": 1.953125, "grad_norm_var": 0.05981852213541667, "learning_rate": 0.0001, "loss": 4.1911, "loss/crossentropy": 2.140450179576874, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23864319920539856, "step": 9062 }, { "epoch": 0.18128, "grad_norm": 2.015625, "grad_norm_var": 0.06083882649739583, "learning_rate": 0.0001, "loss": 4.3442, "loss/crossentropy": 2.4139195680618286, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2360195592045784, "step": 9064 }, { "epoch": 0.18132, "grad_norm": 2.046875, "grad_norm_var": 0.06083882649739583, "learning_rate": 0.0001, "loss": 4.5122, "loss/crossentropy": 2.190356135368347, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22671552002429962, "step": 9066 }, { "epoch": 0.18136, "grad_norm": 2.078125, "grad_norm_var": 0.05916315714518229, "learning_rate": 0.0001, "loss": 4.281, "loss/crossentropy": 1.9428812861442566, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22318705916404724, "step": 9068 }, { "epoch": 0.1814, "grad_norm": 2.046875, "grad_norm_var": 0.059242502848307295, "learning_rate": 0.0001, "loss": 4.4628, "loss/crossentropy": 2.296473503112793, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22005227208137512, "step": 9070 }, { "epoch": 0.18144, "grad_norm": 2.1875, "grad_norm_var": 0.05919774373372396, "learning_rate": 0.0001, "loss": 4.5848, "loss/crossentropy": 2.2118901014328003, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22189343720674515, "step": 9072 }, { "epoch": 0.18148, "grad_norm": 2.15625, "grad_norm_var": 0.05810114542643229, "learning_rate": 0.0001, "loss": 4.285, "loss/crossentropy": 1.8919037580490112, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1986190229654312, "step": 9074 }, { "epoch": 0.18152, "grad_norm": 2.171875, "grad_norm_var": 0.011161295572916667, "learning_rate": 0.0001, "loss": 4.3046, "loss/crossentropy": 1.975312054157257, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20403321832418442, "step": 9076 }, { "epoch": 0.18156, "grad_norm": 2.265625, "grad_norm_var": 0.0075032552083333336, "learning_rate": 0.0001, "loss": 4.2175, "loss/crossentropy": 1.8076966404914856, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19708115607500076, "step": 9078 }, { "epoch": 0.1816, "grad_norm": 2.265625, "grad_norm_var": 0.007112630208333333, "learning_rate": 0.0001, "loss": 4.2166, "loss/crossentropy": 2.101171374320984, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22123181074857712, "step": 9080 }, { "epoch": 0.18164, "grad_norm": 2.1875, "grad_norm_var": 0.007225545247395834, "learning_rate": 0.0001, "loss": 4.2292, "loss/crossentropy": 2.09942090511322, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21761803328990936, "step": 9082 }, { "epoch": 0.18168, "grad_norm": 2.1875, "grad_norm_var": 0.008072916666666667, "learning_rate": 0.0001, "loss": 4.4802, "loss/crossentropy": 2.4418424367904663, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24908769130706787, "step": 9084 }, { "epoch": 0.18172, "grad_norm": 2.109375, "grad_norm_var": 0.0087890625, "learning_rate": 0.0001, "loss": 4.3408, "loss/crossentropy": 1.944950520992279, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20515284687280655, "step": 9086 }, { "epoch": 0.18176, "grad_norm": 2.21875, "grad_norm_var": 0.007991536458333334, "learning_rate": 0.0001, "loss": 4.3117, "loss/crossentropy": 1.870418667793274, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20427027344703674, "step": 9088 }, { "epoch": 0.1818, "grad_norm": 2.171875, "grad_norm_var": 0.007682291666666666, "learning_rate": 0.0001, "loss": 4.1323, "loss/crossentropy": 1.9338520169258118, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19928501546382904, "step": 9090 }, { "epoch": 0.18184, "grad_norm": 1.9609375, "grad_norm_var": 0.008876291910807292, "learning_rate": 0.0001, "loss": 3.9734, "loss/crossentropy": 1.7826221585273743, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19778436422348022, "step": 9092 }, { "epoch": 0.18188, "grad_norm": 2.078125, "grad_norm_var": 0.007458241780598959, "learning_rate": 0.0001, "loss": 4.3537, "loss/crossentropy": 2.2922680377960205, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23613491654396057, "step": 9094 }, { "epoch": 0.18192, "grad_norm": 2.296875, "grad_norm_var": 0.15102513631184897, "learning_rate": 0.0001, "loss": 4.6789, "loss/crossentropy": 2.1802788972854614, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1969536542892456, "step": 9096 }, { "epoch": 0.18196, "grad_norm": 2.15625, "grad_norm_var": 0.1498308817545573, "learning_rate": 0.0001, "loss": 4.1715, "loss/crossentropy": 1.9129992723464966, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1942882016301155, "step": 9098 }, { "epoch": 0.182, "grad_norm": 2.203125, "grad_norm_var": 0.14738337198893228, "learning_rate": 0.0001, "loss": 4.3374, "loss/crossentropy": 1.8288249969482422, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19879616051912308, "step": 9100 }, { "epoch": 0.18204, "grad_norm": 2.3125, "grad_norm_var": 0.14580663045247397, "learning_rate": 0.0001, "loss": 4.2824, "loss/crossentropy": 2.039812684059143, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21374420076608658, "step": 9102 }, { "epoch": 0.18208, "grad_norm": 1.9609375, "grad_norm_var": 0.15449193318684895, "learning_rate": 0.0001, "loss": 4.1862, "loss/crossentropy": 2.1177414059638977, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2193453460931778, "step": 9104 }, { "epoch": 0.18212, "grad_norm": 2.28125, "grad_norm_var": 0.16704076131184895, "learning_rate": 0.0001, "loss": 4.4869, "loss/crossentropy": 2.171097159385681, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21211445331573486, "step": 9106 }, { "epoch": 0.18216, "grad_norm": 2.359375, "grad_norm_var": 0.16413548787434895, "learning_rate": 0.0001, "loss": 4.1422, "loss/crossentropy": 1.9781638383865356, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21307373046875, "step": 9108 }, { "epoch": 0.1822, "grad_norm": 1.984375, "grad_norm_var": 0.17157363891601562, "learning_rate": 0.0001, "loss": 4.2454, "loss/crossentropy": 2.0868560075759888, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20633937418460846, "step": 9110 }, { "epoch": 0.18224, "grad_norm": 2.03125, "grad_norm_var": 0.04592463175455729, "learning_rate": 0.0001, "loss": 4.1153, "loss/crossentropy": 2.079172134399414, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20709815621376038, "step": 9112 }, { "epoch": 0.18228, "grad_norm": 2.359375, "grad_norm_var": 0.04835383097330729, "learning_rate": 0.0001, "loss": 4.1836, "loss/crossentropy": 2.100913643836975, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21831409633159637, "step": 9114 }, { "epoch": 0.18232, "grad_norm": 2.203125, "grad_norm_var": 0.049478912353515626, "learning_rate": 0.0001, "loss": 4.4087, "loss/crossentropy": 2.018395781517029, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22545243054628372, "step": 9116 }, { "epoch": 0.18236, "grad_norm": 2.203125, "grad_norm_var": 0.04278132120768229, "learning_rate": 0.0001, "loss": 4.3421, "loss/crossentropy": 1.9177632331848145, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23346271365880966, "step": 9118 }, { "epoch": 0.1824, "grad_norm": 2.0625, "grad_norm_var": 0.038358306884765624, "learning_rate": 0.0001, "loss": 3.9924, "loss/crossentropy": 2.1122357845306396, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22593770176172256, "step": 9120 }, { "epoch": 0.18244, "grad_norm": 2.03125, "grad_norm_var": 0.014021555582682291, "learning_rate": 0.0001, "loss": 4.1399, "loss/crossentropy": 1.7920495867729187, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2127150148153305, "step": 9122 }, { "epoch": 0.18248, "grad_norm": 2.1875, "grad_norm_var": 0.0096099853515625, "learning_rate": 0.0001, "loss": 4.2061, "loss/crossentropy": 2.2180920839309692, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25088224560022354, "step": 9124 }, { "epoch": 0.18252, "grad_norm": 2.1875, "grad_norm_var": 0.008185831705729167, "learning_rate": 0.0001, "loss": 4.3287, "loss/crossentropy": 1.8999969959259033, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21616832166910172, "step": 9126 }, { "epoch": 0.18256, "grad_norm": 2.125, "grad_norm_var": 0.0081451416015625, "learning_rate": 0.0001, "loss": 4.5351, "loss/crossentropy": 1.9424527287483215, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22016742825508118, "step": 9128 }, { "epoch": 0.1826, "grad_norm": 2.0625, "grad_norm_var": 0.0049468994140625, "learning_rate": 0.0001, "loss": 4.4432, "loss/crossentropy": 2.0539366006851196, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26818516850471497, "step": 9130 }, { "epoch": 0.18264, "grad_norm": 2.0625, "grad_norm_var": 0.00458984375, "learning_rate": 0.0001, "loss": 4.1634, "loss/crossentropy": 1.9180519580841064, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2067112922668457, "step": 9132 }, { "epoch": 0.18268, "grad_norm": 2.078125, "grad_norm_var": 0.005052693684895833, "learning_rate": 0.0001, "loss": 3.9913, "loss/crossentropy": 1.7417545318603516, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1970825269818306, "step": 9134 }, { "epoch": 0.18272, "grad_norm": 2.09375, "grad_norm_var": 0.004833984375, "learning_rate": 0.0001, "loss": 4.016, "loss/crossentropy": 1.9498217701911926, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21281737089157104, "step": 9136 }, { "epoch": 0.18276, "grad_norm": 2.21875, "grad_norm_var": 0.005615234375, "learning_rate": 0.0001, "loss": 4.3842, "loss/crossentropy": 2.140692949295044, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24291902035474777, "step": 9138 }, { "epoch": 0.1828, "grad_norm": 1.984375, "grad_norm_var": 0.0069976806640625, "learning_rate": 0.0001, "loss": 4.1785, "loss/crossentropy": 2.3510342836380005, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23326712846755981, "step": 9140 }, { "epoch": 0.18284, "grad_norm": 2.140625, "grad_norm_var": 0.0072743733723958336, "learning_rate": 0.0001, "loss": 4.0707, "loss/crossentropy": 2.049591898918152, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21118033677339554, "step": 9142 }, { "epoch": 0.18288, "grad_norm": 2.109375, "grad_norm_var": 0.005631510416666667, "learning_rate": 0.0001, "loss": 4.263, "loss/crossentropy": 1.949703335762024, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22161198407411575, "step": 9144 }, { "epoch": 0.18292, "grad_norm": 2.109375, "grad_norm_var": 0.005399576822916667, "learning_rate": 0.0001, "loss": 4.5602, "loss/crossentropy": 2.1442413330078125, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21281076222658157, "step": 9146 }, { "epoch": 0.18296, "grad_norm": 2.078125, "grad_norm_var": 0.007111612955729167, "learning_rate": 0.0001, "loss": 4.0848, "loss/crossentropy": 2.103494882583618, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20892268419265747, "step": 9148 }, { "epoch": 0.183, "grad_norm": 2.25, "grad_norm_var": 0.01060791015625, "learning_rate": 0.0001, "loss": 4.133, "loss/crossentropy": 2.1544495224952698, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2008143737912178, "step": 9150 }, { "epoch": 0.18304, "grad_norm": 1.984375, "grad_norm_var": 0.01109619140625, "learning_rate": 0.0001, "loss": 4.2799, "loss/crossentropy": 2.021821677684784, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20975126326084137, "step": 9152 }, { "epoch": 0.18308, "grad_norm": 1.921875, "grad_norm_var": 0.0102203369140625, "learning_rate": 0.0001, "loss": 4.2858, "loss/crossentropy": 2.109215199947357, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19855067878961563, "step": 9154 }, { "epoch": 0.18312, "grad_norm": 2.09375, "grad_norm_var": 0.011165364583333334, "learning_rate": 0.0001, "loss": 4.2181, "loss/crossentropy": 1.7631941437721252, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20584283769130707, "step": 9156 }, { "epoch": 0.18316, "grad_norm": 2.171875, "grad_norm_var": 0.0116851806640625, "learning_rate": 0.0001, "loss": 4.4005, "loss/crossentropy": 2.131524443626404, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2151477411389351, "step": 9158 }, { "epoch": 0.1832, "grad_norm": 2.015625, "grad_norm_var": 0.01334228515625, "learning_rate": 0.0001, "loss": 4.2729, "loss/crossentropy": 2.018375277519226, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21982873231172562, "step": 9160 }, { "epoch": 0.18324, "grad_norm": 2.09375, "grad_norm_var": 0.0154205322265625, "learning_rate": 0.0001, "loss": 4.3302, "loss/crossentropy": 2.217617154121399, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2257251739501953, "step": 9162 }, { "epoch": 0.18328, "grad_norm": 2.15625, "grad_norm_var": 0.014606730143229166, "learning_rate": 0.0001, "loss": 4.227, "loss/crossentropy": 1.8632460832595825, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19728046655654907, "step": 9164 }, { "epoch": 0.18332, "grad_norm": 2.0625, "grad_norm_var": 0.012137858072916667, "learning_rate": 0.0001, "loss": 4.5335, "loss/crossentropy": 2.2818111181259155, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22397568821907043, "step": 9166 }, { "epoch": 0.18336, "grad_norm": 2.109375, "grad_norm_var": 0.01177978515625, "learning_rate": 0.0001, "loss": 4.4104, "loss/crossentropy": 2.1209938526153564, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22490206360816956, "step": 9168 }, { "epoch": 0.1834, "grad_norm": 2.046875, "grad_norm_var": 0.00924072265625, "learning_rate": 0.0001, "loss": 4.244, "loss/crossentropy": 2.141623795032501, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21710850298404694, "step": 9170 }, { "epoch": 0.18344, "grad_norm": 2.125, "grad_norm_var": 0.007323201497395833, "learning_rate": 0.0001, "loss": 4.2063, "loss/crossentropy": 2.165239691734314, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.235738106071949, "step": 9172 }, { "epoch": 0.18348, "grad_norm": 2.015625, "grad_norm_var": 0.008103179931640624, "learning_rate": 0.0001, "loss": 4.0186, "loss/crossentropy": 1.8649475574493408, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2088451236486435, "step": 9174 }, { "epoch": 0.18352, "grad_norm": 2.15625, "grad_norm_var": 0.007012685139973958, "learning_rate": 0.0001, "loss": 4.2853, "loss/crossentropy": 2.312218189239502, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.236283540725708, "step": 9176 }, { "epoch": 0.18356, "grad_norm": 2.109375, "grad_norm_var": 0.008699544270833333, "learning_rate": 0.0001, "loss": 3.9626, "loss/crossentropy": 2.0149282217025757, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21434535831212997, "step": 9178 }, { "epoch": 0.1836, "grad_norm": 2.125, "grad_norm_var": 0.0086822509765625, "learning_rate": 0.0001, "loss": 4.5081, "loss/crossentropy": 2.499966621398926, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26318275928497314, "step": 9180 }, { "epoch": 0.18364, "grad_norm": 2.140625, "grad_norm_var": 0.0084136962890625, "learning_rate": 0.0001, "loss": 4.4498, "loss/crossentropy": 2.0454984307289124, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2202616035938263, "step": 9182 }, { "epoch": 0.18368, "grad_norm": 1.9609375, "grad_norm_var": 0.010835520426432292, "learning_rate": 0.0001, "loss": 4.0754, "loss/crossentropy": 2.1708725094795227, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20958629250526428, "step": 9184 }, { "epoch": 0.18372, "grad_norm": 2.140625, "grad_norm_var": 0.010792795817057292, "learning_rate": 0.0001, "loss": 4.2104, "loss/crossentropy": 1.8261350989341736, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1952563151717186, "step": 9186 }, { "epoch": 0.18376, "grad_norm": 2.046875, "grad_norm_var": 0.010009511311848959, "learning_rate": 0.0001, "loss": 4.1247, "loss/crossentropy": 2.036627769470215, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22490298002958298, "step": 9188 }, { "epoch": 0.1838, "grad_norm": 1.9765625, "grad_norm_var": 0.010501861572265625, "learning_rate": 0.0001, "loss": 4.0088, "loss/crossentropy": 1.7977086305618286, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19282807409763336, "step": 9190 }, { "epoch": 0.18384, "grad_norm": 2.21875, "grad_norm_var": 0.011211903889973958, "learning_rate": 0.0001, "loss": 4.1774, "loss/crossentropy": 2.170135021209717, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23149186372756958, "step": 9192 }, { "epoch": 0.18388, "grad_norm": 2.328125, "grad_norm_var": 0.012035115559895834, "learning_rate": 0.0001, "loss": 4.4155, "loss/crossentropy": 2.1453020572662354, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25120319426059723, "step": 9194 }, { "epoch": 0.18392, "grad_norm": 2.125, "grad_norm_var": 0.01224365234375, "learning_rate": 0.0001, "loss": 4.3628, "loss/crossentropy": 1.8794063925743103, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20883548259735107, "step": 9196 }, { "epoch": 0.18396, "grad_norm": 2.3125, "grad_norm_var": 0.0141998291015625, "learning_rate": 0.0001, "loss": 4.3384, "loss/crossentropy": 2.021254241466522, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21939975768327713, "step": 9198 }, { "epoch": 0.184, "grad_norm": 2.09375, "grad_norm_var": 0.009557851155598958, "learning_rate": 0.0001, "loss": 4.158, "loss/crossentropy": 2.195580303668976, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2108924463391304, "step": 9200 }, { "epoch": 0.18404, "grad_norm": 2.21875, "grad_norm_var": 0.014085896809895833, "learning_rate": 0.0001, "loss": 4.0781, "loss/crossentropy": 1.6260902881622314, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20428159832954407, "step": 9202 }, { "epoch": 0.18408, "grad_norm": 1.96875, "grad_norm_var": 0.015363566080729167, "learning_rate": 0.0001, "loss": 4.2525, "loss/crossentropy": 2.138678550720215, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21641074120998383, "step": 9204 }, { "epoch": 0.18412, "grad_norm": 2.125, "grad_norm_var": 0.01825129191080729, "learning_rate": 0.0001, "loss": 4.4706, "loss/crossentropy": 2.047194480895996, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20891964435577393, "step": 9206 }, { "epoch": 0.18416, "grad_norm": 2.171875, "grad_norm_var": 0.019606272379557293, "learning_rate": 0.0001, "loss": 4.2312, "loss/crossentropy": 2.189277768135071, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2150397077202797, "step": 9208 }, { "epoch": 0.1842, "grad_norm": 2.140625, "grad_norm_var": 0.017618560791015626, "learning_rate": 0.0001, "loss": 4.2792, "loss/crossentropy": 2.184122920036316, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23278063535690308, "step": 9210 }, { "epoch": 0.18424, "grad_norm": 1.9921875, "grad_norm_var": 0.019139607747395832, "learning_rate": 0.0001, "loss": 4.3471, "loss/crossentropy": 2.413718104362488, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24333150684833527, "step": 9212 }, { "epoch": 0.18428, "grad_norm": 2.078125, "grad_norm_var": 0.019017537434895832, "learning_rate": 0.0001, "loss": 3.8486, "loss/crossentropy": 1.8086814880371094, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19976364076137543, "step": 9214 }, { "epoch": 0.18432, "grad_norm": 2.09375, "grad_norm_var": 0.019978841145833332, "learning_rate": 0.0001, "loss": 4.5222, "loss/crossentropy": 2.2418206930160522, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.229690782725811, "step": 9216 }, { "epoch": 0.18436, "grad_norm": 2.015625, "grad_norm_var": 0.016355133056640624, "learning_rate": 0.0001, "loss": 4.0817, "loss/crossentropy": 1.8083258867263794, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20999443531036377, "step": 9218 }, { "epoch": 0.1844, "grad_norm": 1.953125, "grad_norm_var": 0.01693115234375, "learning_rate": 0.0001, "loss": 3.772, "loss/crossentropy": 1.8117709755897522, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20436270534992218, "step": 9220 }, { "epoch": 0.18444, "grad_norm": 2.15625, "grad_norm_var": 0.010550944010416667, "learning_rate": 0.0001, "loss": 4.2149, "loss/crossentropy": 2.024270534515381, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20097267627716064, "step": 9222 }, { "epoch": 0.18448, "grad_norm": 2.25, "grad_norm_var": 0.011449178059895834, "learning_rate": 0.0001, "loss": 4.4756, "loss/crossentropy": 2.2385981678962708, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21131044626235962, "step": 9224 }, { "epoch": 0.18452, "grad_norm": 2.140625, "grad_norm_var": 0.011271158854166666, "learning_rate": 0.0001, "loss": 4.3745, "loss/crossentropy": 2.127749502658844, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22855369001626968, "step": 9226 }, { "epoch": 0.18456, "grad_norm": 2.15625, "grad_norm_var": 0.011237589518229167, "learning_rate": 0.0001, "loss": 4.2645, "loss/crossentropy": 2.1877033710479736, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2339663878083229, "step": 9228 }, { "epoch": 0.1846, "grad_norm": 2.25, "grad_norm_var": 0.011139933268229167, "learning_rate": 0.0001, "loss": 4.2642, "loss/crossentropy": 1.931507408618927, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20912021398544312, "step": 9230 }, { "epoch": 0.18464, "grad_norm": 2.234375, "grad_norm_var": 0.011579386393229167, "learning_rate": 0.0001, "loss": 4.3309, "loss/crossentropy": 1.8101251125335693, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23724676668643951, "step": 9232 }, { "epoch": 0.18468, "grad_norm": 2.125, "grad_norm_var": 0.0111480712890625, "learning_rate": 0.0001, "loss": 4.201, "loss/crossentropy": 2.015208065509796, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23482007533311844, "step": 9234 }, { "epoch": 0.18472, "grad_norm": 2.1875, "grad_norm_var": 0.010282135009765625, "learning_rate": 0.0001, "loss": 4.5357, "loss/crossentropy": 2.2742738723754883, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23632052540779114, "step": 9236 }, { "epoch": 0.18476, "grad_norm": 2.46875, "grad_norm_var": 0.016806793212890626, "learning_rate": 0.0001, "loss": 4.3574, "loss/crossentropy": 1.7254774570465088, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2104022428393364, "step": 9238 }, { "epoch": 0.1848, "grad_norm": 2.09375, "grad_norm_var": 0.016585032145182293, "learning_rate": 0.0001, "loss": 4.3477, "loss/crossentropy": 2.181770443916321, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22411521524190903, "step": 9240 }, { "epoch": 0.18484, "grad_norm": 1.953125, "grad_norm_var": 0.020157877604166666, "learning_rate": 0.0001, "loss": 3.9517, "loss/crossentropy": 1.8449691534042358, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2091139778494835, "step": 9242 }, { "epoch": 0.18488, "grad_norm": 2.1875, "grad_norm_var": 0.017195383707682293, "learning_rate": 0.0001, "loss": 4.3389, "loss/crossentropy": 2.0917609333992004, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2644127458333969, "step": 9244 }, { "epoch": 0.18492, "grad_norm": 2.28125, "grad_norm_var": 0.01793390909830729, "learning_rate": 0.0001, "loss": 4.083, "loss/crossentropy": 2.060012102127075, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21294714510440826, "step": 9246 }, { "epoch": 0.18496, "grad_norm": 2.078125, "grad_norm_var": 0.018387603759765624, "learning_rate": 0.0001, "loss": 4.6598, "loss/crossentropy": 2.059940278530121, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22648683190345764, "step": 9248 }, { "epoch": 0.185, "grad_norm": 2.078125, "grad_norm_var": 0.01862360636393229, "learning_rate": 0.0001, "loss": 4.4696, "loss/crossentropy": 1.8423291444778442, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22509171068668365, "step": 9250 }, { "epoch": 0.18504, "grad_norm": 2.125, "grad_norm_var": 0.016410064697265626, "learning_rate": 0.0001, "loss": 4.4629, "loss/crossentropy": 2.2559698820114136, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23896963894367218, "step": 9252 }, { "epoch": 0.18508, "grad_norm": 2.03125, "grad_norm_var": 0.01579767862955729, "learning_rate": 0.0001, "loss": 4.6027, "loss/crossentropy": 2.0583658814430237, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22763275355100632, "step": 9254 }, { "epoch": 0.18512, "grad_norm": 2.015625, "grad_norm_var": 0.016658274332682292, "learning_rate": 0.0001, "loss": 4.2654, "loss/crossentropy": 1.816649854183197, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18608735501766205, "step": 9256 }, { "epoch": 0.18516, "grad_norm": 4.65625, "grad_norm_var": 0.4073150634765625, "learning_rate": 0.0001, "loss": 4.2837, "loss/crossentropy": 2.0920958518981934, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22859029471874237, "step": 9258 }, { "epoch": 0.1852, "grad_norm": 2.125, "grad_norm_var": 0.4108306884765625, "learning_rate": 0.0001, "loss": 4.2517, "loss/crossentropy": 2.1475982666015625, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21678777784109116, "step": 9260 }, { "epoch": 0.18524, "grad_norm": 2.21875, "grad_norm_var": 0.4100901285807292, "learning_rate": 0.0001, "loss": 4.4258, "loss/crossentropy": 2.22346031665802, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2333592176437378, "step": 9262 }, { "epoch": 0.18528, "grad_norm": 2.09375, "grad_norm_var": 0.41646703084309894, "learning_rate": 0.0001, "loss": 4.2134, "loss/crossentropy": 1.7652028799057007, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18461769074201584, "step": 9264 }, { "epoch": 0.18532, "grad_norm": 2.109375, "grad_norm_var": 0.4225006103515625, "learning_rate": 0.0001, "loss": 4.2362, "loss/crossentropy": 2.1549625396728516, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22428707033395767, "step": 9266 }, { "epoch": 0.18536, "grad_norm": 2.0, "grad_norm_var": 0.4281972249348958, "learning_rate": 0.0001, "loss": 4.3685, "loss/crossentropy": 2.1677842140197754, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22887174785137177, "step": 9268 }, { "epoch": 0.1854, "grad_norm": 2.09375, "grad_norm_var": 0.42428792317708336, "learning_rate": 0.0001, "loss": 4.1517, "loss/crossentropy": 1.8352131247520447, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19628287106752396, "step": 9270 }, { "epoch": 0.18544, "grad_norm": 2.109375, "grad_norm_var": 0.43038304646809894, "learning_rate": 0.0001, "loss": 4.101, "loss/crossentropy": 2.104279100894928, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20481227338314056, "step": 9272 }, { "epoch": 0.18548, "grad_norm": 2.21875, "grad_norm_var": 0.009795888264973959, "learning_rate": 0.0001, "loss": 4.2011, "loss/crossentropy": 1.8447301387786865, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2227916270494461, "step": 9274 }, { "epoch": 0.18552, "grad_norm": 2.296875, "grad_norm_var": 0.013651275634765625, "learning_rate": 0.0001, "loss": 4.3544, "loss/crossentropy": 2.303207755088806, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24895529448986053, "step": 9276 }, { "epoch": 0.18556, "grad_norm": 2.171875, "grad_norm_var": 0.014115142822265624, "learning_rate": 0.0001, "loss": 4.699, "loss/crossentropy": 2.248077630996704, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23077847063541412, "step": 9278 }, { "epoch": 0.1856, "grad_norm": 2.34375, "grad_norm_var": 0.017438761393229165, "learning_rate": 0.0001, "loss": 4.4943, "loss/crossentropy": 2.261389970779419, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23669905960559845, "step": 9280 }, { "epoch": 0.18564, "grad_norm": 2.234375, "grad_norm_var": 0.016056060791015625, "learning_rate": 0.0001, "loss": 4.3805, "loss/crossentropy": 2.402338147163391, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2453952580690384, "step": 9282 }, { "epoch": 0.18568, "grad_norm": 2.265625, "grad_norm_var": 0.01587702433268229, "learning_rate": 0.0001, "loss": 4.2253, "loss/crossentropy": 2.011174201965332, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22673364728689194, "step": 9284 }, { "epoch": 0.18572, "grad_norm": 2.234375, "grad_norm_var": 0.016810862223307292, "learning_rate": 0.0001, "loss": 4.3897, "loss/crossentropy": 1.9700093269348145, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22711393237113953, "step": 9286 }, { "epoch": 0.18576, "grad_norm": 2.140625, "grad_norm_var": 0.014922841389973959, "learning_rate": 0.0001, "loss": 3.9609, "loss/crossentropy": 2.0253939032554626, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21902555227279663, "step": 9288 }, { "epoch": 0.1858, "grad_norm": 2.296875, "grad_norm_var": 0.016961415608723957, "learning_rate": 0.0001, "loss": 4.1799, "loss/crossentropy": 1.741984784603119, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1879509538412094, "step": 9290 }, { "epoch": 0.18584, "grad_norm": 2.296875, "grad_norm_var": 0.022564442952473958, "learning_rate": 0.0001, "loss": 4.7983, "loss/crossentropy": 2.3156943321228027, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2687100023031235, "step": 9292 }, { "epoch": 0.18588, "grad_norm": 2.015625, "grad_norm_var": 0.025833892822265624, "learning_rate": 0.0001, "loss": 4.2894, "loss/crossentropy": 2.0826371908187866, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21234872937202454, "step": 9294 }, { "epoch": 0.18592, "grad_norm": 2.078125, "grad_norm_var": 0.025921376546223958, "learning_rate": 0.0001, "loss": 4.6473, "loss/crossentropy": 2.4080610275268555, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2670409381389618, "step": 9296 }, { "epoch": 0.18596, "grad_norm": 2.046875, "grad_norm_var": 0.02958958943684896, "learning_rate": 0.0001, "loss": 4.1997, "loss/crossentropy": 1.722270905971527, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20094333589076996, "step": 9298 }, { "epoch": 0.186, "grad_norm": 2.03125, "grad_norm_var": 0.028527577718098957, "learning_rate": 0.0001, "loss": 4.3247, "loss/crossentropy": 2.0514711141586304, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.209539495408535, "step": 9300 }, { "epoch": 0.18604, "grad_norm": 2.640625, "grad_norm_var": 2.5507850646972656, "learning_rate": 0.0001, "loss": 4.9819, "loss/crossentropy": 2.4808409214019775, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2367517650127411, "step": 9302 }, { "epoch": 0.18608, "grad_norm": 2.078125, "grad_norm_var": 2.536018880208333, "learning_rate": 0.0001, "loss": 4.1105, "loss/crossentropy": 2.2539944648742676, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22834083437919617, "step": 9304 }, { "epoch": 0.18612, "grad_norm": 2.078125, "grad_norm_var": 2.5449544270833333, "learning_rate": 0.0001, "loss": 4.2383, "loss/crossentropy": 1.9335210919380188, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20264607667922974, "step": 9306 }, { "epoch": 0.18616, "grad_norm": 2.109375, "grad_norm_var": 2.5707194010416665, "learning_rate": 0.0001, "loss": 4.2285, "loss/crossentropy": 1.90863037109375, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20882528275251389, "step": 9308 }, { "epoch": 0.1862, "grad_norm": 2.1875, "grad_norm_var": 2.5589996337890626, "learning_rate": 0.0001, "loss": 4.2195, "loss/crossentropy": 2.1449084281921387, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2329169511795044, "step": 9310 }, { "epoch": 0.18624, "grad_norm": 2.015625, "grad_norm_var": 2.567341105143229, "learning_rate": 0.0001, "loss": 4.1806, "loss/crossentropy": 1.9571366906166077, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2148708775639534, "step": 9312 }, { "epoch": 0.18628, "grad_norm": 2.25, "grad_norm_var": 2.542252604166667, "learning_rate": 0.0001, "loss": 4.4691, "loss/crossentropy": 2.174731135368347, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2280728593468666, "step": 9314 }, { "epoch": 0.18632, "grad_norm": 1.96875, "grad_norm_var": 2.5416575113932294, "learning_rate": 0.0001, "loss": 4.4344, "loss/crossentropy": 1.9569833874702454, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20855706185102463, "step": 9316 }, { "epoch": 0.18636, "grad_norm": 2.1875, "grad_norm_var": 0.0090972900390625, "learning_rate": 0.0001, "loss": 4.3162, "loss/crossentropy": 2.153563976287842, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22807130962610245, "step": 9318 }, { "epoch": 0.1864, "grad_norm": 2.203125, "grad_norm_var": 0.010741933186848959, "learning_rate": 0.0001, "loss": 4.3028, "loss/crossentropy": 1.9219058752059937, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2295902967453003, "step": 9320 }, { "epoch": 0.18644, "grad_norm": 1.96875, "grad_norm_var": 0.011926015218098959, "learning_rate": 0.0001, "loss": 4.29, "loss/crossentropy": 2.1993675231933594, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.223759263753891, "step": 9322 }, { "epoch": 0.18648, "grad_norm": 2.078125, "grad_norm_var": 0.011730702718098958, "learning_rate": 0.0001, "loss": 4.2254, "loss/crossentropy": 2.173740863800049, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23405525833368301, "step": 9324 }, { "epoch": 0.18652, "grad_norm": 2.25, "grad_norm_var": 0.012564849853515626, "learning_rate": 0.0001, "loss": 4.3717, "loss/crossentropy": 2.026577115058899, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2265823632478714, "step": 9326 }, { "epoch": 0.18656, "grad_norm": 2.671875, "grad_norm_var": 0.030326080322265626, "learning_rate": 0.0001, "loss": 4.4664, "loss/crossentropy": 2.3629637956619263, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23659023642539978, "step": 9328 }, { "epoch": 0.1866, "grad_norm": 2.234375, "grad_norm_var": 0.03050715128580729, "learning_rate": 0.0001, "loss": 4.3245, "loss/crossentropy": 2.100727915763855, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21468425542116165, "step": 9330 }, { "epoch": 0.18664, "grad_norm": 2.109375, "grad_norm_var": 0.025923411051432293, "learning_rate": 0.0001, "loss": 4.5291, "loss/crossentropy": 2.163568615913391, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23402437567710876, "step": 9332 }, { "epoch": 0.18668, "grad_norm": 1.9375, "grad_norm_var": 0.02934748331705729, "learning_rate": 0.0001, "loss": 3.9687, "loss/crossentropy": 1.9579994082450867, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19713342934846878, "step": 9334 }, { "epoch": 0.18672, "grad_norm": 1.9609375, "grad_norm_var": 0.029412587483723957, "learning_rate": 0.0001, "loss": 4.2586, "loss/crossentropy": 1.805375874042511, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22647518664598465, "step": 9336 }, { "epoch": 0.18676, "grad_norm": 2.171875, "grad_norm_var": 0.02797215779622396, "learning_rate": 0.0001, "loss": 4.2025, "loss/crossentropy": 2.0764458775520325, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2270444855093956, "step": 9338 }, { "epoch": 0.1868, "grad_norm": 2.03125, "grad_norm_var": 0.030987294514973958, "learning_rate": 0.0001, "loss": 3.8731, "loss/crossentropy": 1.5654467940330505, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.15944529324769974, "step": 9340 }, { "epoch": 0.18684, "grad_norm": 2.15625, "grad_norm_var": 0.02976048787434896, "learning_rate": 0.0001, "loss": 4.3831, "loss/crossentropy": 2.007612407207489, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2061111181974411, "step": 9342 }, { "epoch": 0.18688, "grad_norm": 1.875, "grad_norm_var": 0.010910797119140624, "learning_rate": 0.0001, "loss": 4.0835, "loss/crossentropy": 1.981432855129242, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19339826703071594, "step": 9344 }, { "epoch": 0.18692, "grad_norm": 1.9921875, "grad_norm_var": 0.00965576171875, "learning_rate": 0.0001, "loss": 4.1291, "loss/crossentropy": 1.804275631904602, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1968545839190483, "step": 9346 }, { "epoch": 0.18696, "grad_norm": 2.078125, "grad_norm_var": 0.008763631184895834, "learning_rate": 0.0001, "loss": 4.2109, "loss/crossentropy": 1.9718617796897888, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21482623368501663, "step": 9348 }, { "epoch": 0.187, "grad_norm": 2.15625, "grad_norm_var": 0.008397420247395834, "learning_rate": 0.0001, "loss": 4.1672, "loss/crossentropy": 1.8358338475227356, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20042669028043747, "step": 9350 }, { "epoch": 0.18704, "grad_norm": 2.15625, "grad_norm_var": 0.009089914957682292, "learning_rate": 0.0001, "loss": 4.2045, "loss/crossentropy": 1.9447709321975708, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21207460761070251, "step": 9352 }, { "epoch": 0.18708, "grad_norm": 2.03125, "grad_norm_var": 0.011356353759765625, "learning_rate": 0.0001, "loss": 4.3134, "loss/crossentropy": 1.8921163082122803, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19945629686117172, "step": 9354 }, { "epoch": 0.18712, "grad_norm": 2.171875, "grad_norm_var": 0.009580230712890625, "learning_rate": 0.0001, "loss": 4.2018, "loss/crossentropy": 2.1323947310447693, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2258753925561905, "step": 9356 }, { "epoch": 0.18716, "grad_norm": 2.109375, "grad_norm_var": 0.009277089436848959, "learning_rate": 0.0001, "loss": 4.4278, "loss/crossentropy": 2.064914345741272, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23450962454080582, "step": 9358 }, { "epoch": 0.1872, "grad_norm": 2.046875, "grad_norm_var": 0.005783843994140625, "learning_rate": 0.0001, "loss": 4.0173, "loss/crossentropy": 2.1264703273773193, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2169734537601471, "step": 9360 }, { "epoch": 0.18724, "grad_norm": 2.09375, "grad_norm_var": 0.004784138997395834, "learning_rate": 0.0001, "loss": 4.2926, "loss/crossentropy": 1.7044150233268738, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19169463962316513, "step": 9362 }, { "epoch": 0.18728, "grad_norm": 2.0625, "grad_norm_var": 0.004541015625, "learning_rate": 0.0001, "loss": 4.5367, "loss/crossentropy": 1.9475398659706116, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22891414165496826, "step": 9364 }, { "epoch": 0.18732, "grad_norm": 1.9140625, "grad_norm_var": 0.008017730712890626, "learning_rate": 0.0001, "loss": 4.1416, "loss/crossentropy": 2.134114623069763, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2241017445921898, "step": 9366 }, { "epoch": 0.18736, "grad_norm": 2.1875, "grad_norm_var": 0.007342274983723958, "learning_rate": 0.0001, "loss": 4.2185, "loss/crossentropy": 1.8317620158195496, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20189791917800903, "step": 9368 }, { "epoch": 0.1874, "grad_norm": 2.140625, "grad_norm_var": 0.005041249593098958, "learning_rate": 0.0001, "loss": 4.3518, "loss/crossentropy": 2.257538855075836, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2343401461839676, "step": 9370 }, { "epoch": 0.18744, "grad_norm": 2.09375, "grad_norm_var": 0.005228424072265625, "learning_rate": 0.0001, "loss": 4.0535, "loss/crossentropy": 2.3722634315490723, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23045828938484192, "step": 9372 }, { "epoch": 0.18748, "grad_norm": 2.078125, "grad_norm_var": 0.005222320556640625, "learning_rate": 0.0001, "loss": 4.2293, "loss/crossentropy": 2.1880545020103455, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2345728725194931, "step": 9374 }, { "epoch": 0.18752, "grad_norm": 2.3125, "grad_norm_var": 0.008255767822265624, "learning_rate": 0.0001, "loss": 4.5037, "loss/crossentropy": 1.8883287906646729, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25232937932014465, "step": 9376 }, { "epoch": 0.18756, "grad_norm": 2.21875, "grad_norm_var": 0.009124501546223959, "learning_rate": 0.0001, "loss": 4.2963, "loss/crossentropy": 1.9253730773925781, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20324261486530304, "step": 9378 }, { "epoch": 0.1876, "grad_norm": 2.078125, "grad_norm_var": 0.009211985270182292, "learning_rate": 0.0001, "loss": 4.2745, "loss/crossentropy": 1.961540937423706, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22623597085475922, "step": 9380 }, { "epoch": 0.18764, "grad_norm": 2.203125, "grad_norm_var": 0.007835896809895833, "learning_rate": 0.0001, "loss": 4.347, "loss/crossentropy": 2.190120279788971, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24154195934534073, "step": 9382 }, { "epoch": 0.18768, "grad_norm": 2.140625, "grad_norm_var": 0.007445271809895833, "learning_rate": 0.0001, "loss": 4.6159, "loss/crossentropy": 2.0888350009918213, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22750811278820038, "step": 9384 }, { "epoch": 0.18772, "grad_norm": 2.1875, "grad_norm_var": 0.008128865559895834, "learning_rate": 0.0001, "loss": 4.3171, "loss/crossentropy": 1.724816918373108, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21245518326759338, "step": 9386 }, { "epoch": 0.18776, "grad_norm": 2.0, "grad_norm_var": 0.009307607014973959, "learning_rate": 0.0001, "loss": 4.0868, "loss/crossentropy": 1.6542762517929077, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1961463838815689, "step": 9388 }, { "epoch": 0.1878, "grad_norm": 2.078125, "grad_norm_var": 0.009714508056640625, "learning_rate": 0.0001, "loss": 4.4748, "loss/crossentropy": 2.3528761863708496, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23583710938692093, "step": 9390 }, { "epoch": 0.18784, "grad_norm": 2.03125, "grad_norm_var": 0.009012603759765625, "learning_rate": 0.0001, "loss": 4.394, "loss/crossentropy": 2.181188702583313, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24201467633247375, "step": 9392 }, { "epoch": 0.18788, "grad_norm": 2.0625, "grad_norm_var": 0.008414459228515626, "learning_rate": 0.0001, "loss": 4.3476, "loss/crossentropy": 2.280683398246765, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22065220028162003, "step": 9394 }, { "epoch": 0.18792, "grad_norm": 2.21875, "grad_norm_var": 0.009275054931640625, "learning_rate": 0.0001, "loss": 4.1603, "loss/crossentropy": 1.9839438199996948, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22708184272050858, "step": 9396 }, { "epoch": 0.18796, "grad_norm": 2.125, "grad_norm_var": 0.0070879618326822914, "learning_rate": 0.0001, "loss": 4.4322, "loss/crossentropy": 2.311874270439148, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22249652445316315, "step": 9398 }, { "epoch": 0.188, "grad_norm": 2.140625, "grad_norm_var": 0.008503214518229166, "learning_rate": 0.0001, "loss": 4.3808, "loss/crossentropy": 2.430112838745117, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23544297367334366, "step": 9400 }, { "epoch": 0.18804, "grad_norm": 1.96875, "grad_norm_var": 0.00892333984375, "learning_rate": 0.0001, "loss": 4.2393, "loss/crossentropy": 2.146397888660431, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23211465775966644, "step": 9402 }, { "epoch": 0.18808, "grad_norm": 2.203125, "grad_norm_var": 0.008318837483723958, "learning_rate": 0.0001, "loss": 4.5071, "loss/crossentropy": 2.4402170181274414, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25594406574964523, "step": 9404 }, { "epoch": 0.18812, "grad_norm": 2.046875, "grad_norm_var": 0.008294423421223959, "learning_rate": 0.0001, "loss": 4.3848, "loss/crossentropy": 2.141621232032776, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2156461626291275, "step": 9406 }, { "epoch": 0.18816, "grad_norm": 2.0625, "grad_norm_var": 0.006290435791015625, "learning_rate": 0.0001, "loss": 4.2511, "loss/crossentropy": 2.0266456604003906, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20194754749536514, "step": 9408 }, { "epoch": 0.1882, "grad_norm": 1.9765625, "grad_norm_var": 0.007062784830729167, "learning_rate": 0.0001, "loss": 4.2831, "loss/crossentropy": 2.304496645927429, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22345459461212158, "step": 9410 }, { "epoch": 0.18824, "grad_norm": 2.25, "grad_norm_var": 0.007523600260416667, "learning_rate": 0.0001, "loss": 4.4085, "loss/crossentropy": 1.8486470580101013, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20493299514055252, "step": 9412 }, { "epoch": 0.18828, "grad_norm": 2.09375, "grad_norm_var": 0.0075927734375, "learning_rate": 0.0001, "loss": 4.4562, "loss/crossentropy": 2.0593990683555603, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22942744940519333, "step": 9414 }, { "epoch": 0.18832, "grad_norm": 2.15625, "grad_norm_var": 0.006528472900390625, "learning_rate": 0.0001, "loss": 4.1551, "loss/crossentropy": 1.7936111688613892, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1776513010263443, "step": 9416 }, { "epoch": 0.18836, "grad_norm": 2.0, "grad_norm_var": 0.005997467041015625, "learning_rate": 0.0001, "loss": 4.4484, "loss/crossentropy": 2.0676616430282593, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21748381853103638, "step": 9418 }, { "epoch": 0.1884, "grad_norm": 2.1875, "grad_norm_var": 0.005236562093098958, "learning_rate": 0.0001, "loss": 4.3415, "loss/crossentropy": 2.0317665934562683, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2215682566165924, "step": 9420 }, { "epoch": 0.18844, "grad_norm": 2.234375, "grad_norm_var": 0.006359608968098959, "learning_rate": 0.0001, "loss": 4.5623, "loss/crossentropy": 2.3345483541488647, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24850556254386902, "step": 9422 }, { "epoch": 0.18848, "grad_norm": 2.03125, "grad_norm_var": 0.007043202718098958, "learning_rate": 0.0001, "loss": 4.3526, "loss/crossentropy": 2.0603779554367065, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20997442305088043, "step": 9424 }, { "epoch": 0.18852, "grad_norm": 2.15625, "grad_norm_var": 0.006591796875, "learning_rate": 0.0001, "loss": 4.4944, "loss/crossentropy": 1.9450209140777588, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21465806663036346, "step": 9426 }, { "epoch": 0.18856, "grad_norm": 2.046875, "grad_norm_var": 0.0065266927083333336, "learning_rate": 0.0001, "loss": 4.2452, "loss/crossentropy": 2.3568087816238403, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24619507789611816, "step": 9428 }, { "epoch": 0.1886, "grad_norm": 2.25, "grad_norm_var": 0.007828776041666667, "learning_rate": 0.0001, "loss": 4.1976, "loss/crossentropy": 2.2047020196914673, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2519628629088402, "step": 9430 }, { "epoch": 0.18864, "grad_norm": 1.9609375, "grad_norm_var": 0.010188547770182292, "learning_rate": 0.0001, "loss": 4.3168, "loss/crossentropy": 2.1594117879867554, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21274243295192719, "step": 9432 }, { "epoch": 0.18868, "grad_norm": 2.25, "grad_norm_var": 0.009936269124348958, "learning_rate": 0.0001, "loss": 4.3847, "loss/crossentropy": 2.120336890220642, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2092270776629448, "step": 9434 }, { "epoch": 0.18872, "grad_norm": 2.015625, "grad_norm_var": 0.011120351155598958, "learning_rate": 0.0001, "loss": 4.2725, "loss/crossentropy": 2.13198459148407, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.238224595785141, "step": 9436 }, { "epoch": 0.18876, "grad_norm": 2.09375, "grad_norm_var": 0.010009511311848959, "learning_rate": 0.0001, "loss": 4.3706, "loss/crossentropy": 1.9252395629882812, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22254322469234467, "step": 9438 }, { "epoch": 0.1888, "grad_norm": 2.09375, "grad_norm_var": 0.0123291015625, "learning_rate": 0.0001, "loss": 4.0574, "loss/crossentropy": 2.123266577720642, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22276867926120758, "step": 9440 }, { "epoch": 0.18884, "grad_norm": 2.140625, "grad_norm_var": 0.010741170247395833, "learning_rate": 0.0001, "loss": 4.2195, "loss/crossentropy": 1.9219747185707092, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18884174525737762, "step": 9442 }, { "epoch": 0.18888, "grad_norm": 2.125, "grad_norm_var": 0.010587565104166667, "learning_rate": 0.0001, "loss": 4.2603, "loss/crossentropy": 2.0122207403182983, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20913395285606384, "step": 9444 }, { "epoch": 0.18892, "grad_norm": 1.953125, "grad_norm_var": 0.010632069905598958, "learning_rate": 0.0001, "loss": 4.0871, "loss/crossentropy": 2.0255361199378967, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1982431635260582, "step": 9446 }, { "epoch": 0.18896, "grad_norm": 1.9921875, "grad_norm_var": 0.008770497639973958, "learning_rate": 0.0001, "loss": 4.4085, "loss/crossentropy": 1.8060500025749207, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19787351042032242, "step": 9448 }, { "epoch": 0.189, "grad_norm": 2.171875, "grad_norm_var": 0.007389068603515625, "learning_rate": 0.0001, "loss": 4.3799, "loss/crossentropy": 2.340656042098999, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22321298718452454, "step": 9450 }, { "epoch": 0.18904, "grad_norm": 1.9296875, "grad_norm_var": 0.00966796875, "learning_rate": 0.0001, "loss": 4.3973, "loss/crossentropy": 2.35786235332489, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2356308028101921, "step": 9452 }, { "epoch": 0.18908, "grad_norm": 2.0625, "grad_norm_var": 0.01002197265625, "learning_rate": 0.0001, "loss": 4.2622, "loss/crossentropy": 2.344806671142578, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21194154769182205, "step": 9454 }, { "epoch": 0.18912, "grad_norm": 2.046875, "grad_norm_var": 0.008786773681640625, "learning_rate": 0.0001, "loss": 4.4861, "loss/crossentropy": 2.2449493408203125, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21038557589054108, "step": 9456 }, { "epoch": 0.18916, "grad_norm": 1.8515625, "grad_norm_var": 0.014518229166666667, "learning_rate": 0.0001, "loss": 4.0343, "loss/crossentropy": 1.977162778377533, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20586547255516052, "step": 9458 }, { "epoch": 0.1892, "grad_norm": 2.203125, "grad_norm_var": 0.015860748291015626, "learning_rate": 0.0001, "loss": 4.3453, "loss/crossentropy": 2.0941065549850464, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2197253629565239, "step": 9460 }, { "epoch": 0.18924, "grad_norm": 2.03125, "grad_norm_var": 0.0143463134765625, "learning_rate": 0.0001, "loss": 4.3479, "loss/crossentropy": 2.5145565271377563, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25138507783412933, "step": 9462 }, { "epoch": 0.18928, "grad_norm": 2.0, "grad_norm_var": 0.014289347330729167, "learning_rate": 0.0001, "loss": 4.4076, "loss/crossentropy": 2.2870718240737915, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2298167496919632, "step": 9464 }, { "epoch": 0.18932, "grad_norm": 2.109375, "grad_norm_var": 0.013678995768229167, "learning_rate": 0.0001, "loss": 4.3588, "loss/crossentropy": 2.2095978260040283, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.217079259455204, "step": 9466 }, { "epoch": 0.18936, "grad_norm": 2.140625, "grad_norm_var": 0.011726633707682291, "learning_rate": 0.0001, "loss": 4.3735, "loss/crossentropy": 1.9591819047927856, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21825183182954788, "step": 9468 }, { "epoch": 0.1894, "grad_norm": 2.1875, "grad_norm_var": 0.011352284749348959, "learning_rate": 0.0001, "loss": 4.5072, "loss/crossentropy": 2.266845226287842, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24172072112560272, "step": 9470 }, { "epoch": 0.18944, "grad_norm": 2.203125, "grad_norm_var": 0.011437733968098959, "learning_rate": 0.0001, "loss": 4.3448, "loss/crossentropy": 2.0732688903808594, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23074156790971756, "step": 9472 }, { "epoch": 0.18948, "grad_norm": 2.34375, "grad_norm_var": 0.0090728759765625, "learning_rate": 0.0001, "loss": 4.6697, "loss/crossentropy": 1.935340702533722, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2194545865058899, "step": 9474 }, { "epoch": 0.18952, "grad_norm": 2.125, "grad_norm_var": 0.007594553629557291, "learning_rate": 0.0001, "loss": 4.1518, "loss/crossentropy": 1.795669674873352, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18794939666986465, "step": 9476 }, { "epoch": 0.18956, "grad_norm": 2.109375, "grad_norm_var": 0.008841705322265626, "learning_rate": 0.0001, "loss": 3.7745, "loss/crossentropy": 1.9292446970939636, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20147182047367096, "step": 9478 }, { "epoch": 0.1896, "grad_norm": 2.09375, "grad_norm_var": 0.0067779541015625, "learning_rate": 0.0001, "loss": 4.1955, "loss/crossentropy": 2.0110061168670654, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2096811756491661, "step": 9480 }, { "epoch": 0.18964, "grad_norm": 1.9921875, "grad_norm_var": 0.008034006754557291, "learning_rate": 0.0001, "loss": 4.2937, "loss/crossentropy": 2.1543468236923218, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2142205834388733, "step": 9482 }, { "epoch": 0.18968, "grad_norm": 2.046875, "grad_norm_var": 0.009723917643229166, "learning_rate": 0.0001, "loss": 4.0126, "loss/crossentropy": 2.0860520601272583, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21716032177209854, "step": 9484 }, { "epoch": 0.18972, "grad_norm": 1.9921875, "grad_norm_var": 0.010241444905598958, "learning_rate": 0.0001, "loss": 3.9412, "loss/crossentropy": 1.7190409302711487, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19397014379501343, "step": 9486 }, { "epoch": 0.18976, "grad_norm": 2.234375, "grad_norm_var": 0.017116038004557292, "learning_rate": 0.0001, "loss": 4.4711, "loss/crossentropy": 2.178081512451172, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2280048429965973, "step": 9488 }, { "epoch": 0.1898, "grad_norm": 2.109375, "grad_norm_var": 0.013063303629557292, "learning_rate": 0.0001, "loss": 4.267, "loss/crossentropy": 2.0749863982200623, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22755203396081924, "step": 9490 }, { "epoch": 0.18984, "grad_norm": 2.09375, "grad_norm_var": 0.014098866780598959, "learning_rate": 0.0001, "loss": 4.3147, "loss/crossentropy": 2.214204430580139, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23071825504302979, "step": 9492 }, { "epoch": 0.18988, "grad_norm": 2.09375, "grad_norm_var": 0.015794881184895835, "learning_rate": 0.0001, "loss": 4.0041, "loss/crossentropy": 1.6625414490699768, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1857389286160469, "step": 9494 }, { "epoch": 0.18992, "grad_norm": 2.375, "grad_norm_var": 0.020539347330729166, "learning_rate": 0.0001, "loss": 4.4474, "loss/crossentropy": 1.8537201285362244, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21718977391719818, "step": 9496 }, { "epoch": 0.18996, "grad_norm": 1.84375, "grad_norm_var": 0.024074045817057292, "learning_rate": 0.0001, "loss": 4.0488, "loss/crossentropy": 1.716725468635559, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18544895946979523, "step": 9498 }, { "epoch": 0.19, "grad_norm": 1.9453125, "grad_norm_var": 0.024192047119140626, "learning_rate": 0.0001, "loss": 4.1035, "loss/crossentropy": 1.94467431306839, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.210828959941864, "step": 9500 }, { "epoch": 0.19004, "grad_norm": 2.015625, "grad_norm_var": 0.023298136393229165, "learning_rate": 0.0001, "loss": 4.2383, "loss/crossentropy": 1.9377062320709229, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22141354531049728, "step": 9502 }, { "epoch": 0.19008, "grad_norm": 2.0, "grad_norm_var": 0.016950480143229165, "learning_rate": 0.0001, "loss": 4.3717, "loss/crossentropy": 2.2015358209609985, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22363336384296417, "step": 9504 }, { "epoch": 0.19012, "grad_norm": 2.109375, "grad_norm_var": 0.017235310872395833, "learning_rate": 0.0001, "loss": 4.2332, "loss/crossentropy": 2.373024582862854, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2402234748005867, "step": 9506 }, { "epoch": 0.19016, "grad_norm": 2.140625, "grad_norm_var": 0.016649373372395835, "learning_rate": 0.0001, "loss": 4.0519, "loss/crossentropy": 2.1573110222816467, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24207139760255814, "step": 9508 }, { "epoch": 0.1902, "grad_norm": 2.203125, "grad_norm_var": 0.014788564046223958, "learning_rate": 0.0001, "loss": 4.4041, "loss/crossentropy": 2.324455976486206, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24618541449308395, "step": 9510 }, { "epoch": 0.19024, "grad_norm": 2.078125, "grad_norm_var": 0.010603586832682291, "learning_rate": 0.0001, "loss": 4.0065, "loss/crossentropy": 1.7480111718177795, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2049119919538498, "step": 9512 }, { "epoch": 0.19028, "grad_norm": 2.125, "grad_norm_var": 0.006300608317057292, "learning_rate": 0.0001, "loss": 4.3644, "loss/crossentropy": 1.9925439953804016, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2314591035246849, "step": 9514 }, { "epoch": 0.19032, "grad_norm": 2.140625, "grad_norm_var": 0.007094065348307292, "learning_rate": 0.0001, "loss": 4.0702, "loss/crossentropy": 2.133601188659668, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2329796403646469, "step": 9516 }, { "epoch": 0.19036, "grad_norm": 1.984375, "grad_norm_var": 0.010198720296223958, "learning_rate": 0.0001, "loss": 4.246, "loss/crossentropy": 2.093464970588684, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22636859863996506, "step": 9518 }, { "epoch": 0.1904, "grad_norm": 2.0625, "grad_norm_var": 0.009757232666015626, "learning_rate": 0.0001, "loss": 4.4766, "loss/crossentropy": 2.6137614250183105, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.27037859708070755, "step": 9520 }, { "epoch": 0.19044, "grad_norm": 2.125, "grad_norm_var": 0.012300364176432292, "learning_rate": 0.0001, "loss": 4.5499, "loss/crossentropy": 2.008640229701996, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21647297590970993, "step": 9522 }, { "epoch": 0.19048, "grad_norm": 2.078125, "grad_norm_var": 0.012286122639973958, "learning_rate": 0.0001, "loss": 4.2467, "loss/crossentropy": 2.12644362449646, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23370585590600967, "step": 9524 }, { "epoch": 0.19052, "grad_norm": 2.109375, "grad_norm_var": 0.011425526936848958, "learning_rate": 0.0001, "loss": 4.5409, "loss/crossentropy": 2.2338638305664062, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23637598007917404, "step": 9526 }, { "epoch": 0.19056, "grad_norm": 2.0, "grad_norm_var": 0.011785634358723958, "learning_rate": 0.0001, "loss": 4.1405, "loss/crossentropy": 2.0632832646369934, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20682457089424133, "step": 9528 }, { "epoch": 0.1906, "grad_norm": 1.9921875, "grad_norm_var": 0.012726847330729167, "learning_rate": 0.0001, "loss": 4.4058, "loss/crossentropy": 2.219120740890503, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22551076859235764, "step": 9530 }, { "epoch": 0.19064, "grad_norm": 1.953125, "grad_norm_var": 0.011860911051432292, "learning_rate": 0.0001, "loss": 4.2985, "loss/crossentropy": 2.4633371829986572, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24491076171398163, "step": 9532 }, { "epoch": 0.19068, "grad_norm": 2.03125, "grad_norm_var": 0.008211008707682292, "learning_rate": 0.0001, "loss": 4.2152, "loss/crossentropy": 2.2408339977264404, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22265981882810593, "step": 9534 }, { "epoch": 0.19072, "grad_norm": 2.234375, "grad_norm_var": 0.009544881184895833, "learning_rate": 0.0001, "loss": 4.326, "loss/crossentropy": 1.9779353141784668, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2159332111477852, "step": 9536 }, { "epoch": 0.19076, "grad_norm": 2.265625, "grad_norm_var": 0.0082275390625, "learning_rate": 0.0001, "loss": 4.3522, "loss/crossentropy": 2.0315812826156616, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20710154622793198, "step": 9538 }, { "epoch": 0.1908, "grad_norm": 2.078125, "grad_norm_var": 0.011787923177083333, "learning_rate": 0.0001, "loss": 4.18, "loss/crossentropy": 2.085337817668915, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22174393385648727, "step": 9540 }, { "epoch": 0.19084, "grad_norm": 2.03125, "grad_norm_var": 0.016507975260416665, "learning_rate": 0.0001, "loss": 4.4386, "loss/crossentropy": 2.3449169397354126, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3135230466723442, "step": 9542 }, { "epoch": 0.19088, "grad_norm": 2.046875, "grad_norm_var": 0.016169230143229168, "learning_rate": 0.0001, "loss": 4.3943, "loss/crossentropy": 2.1083431243896484, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21395261585712433, "step": 9544 }, { "epoch": 0.19092, "grad_norm": 2.03125, "grad_norm_var": 0.015730539957682293, "learning_rate": 0.0001, "loss": 4.2091, "loss/crossentropy": 2.0386710166931152, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22487390786409378, "step": 9546 }, { "epoch": 0.19096, "grad_norm": 2.21875, "grad_norm_var": 0.015240224202473958, "learning_rate": 0.0001, "loss": 4.2377, "loss/crossentropy": 1.9533087611198425, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2223353162407875, "step": 9548 }, { "epoch": 0.191, "grad_norm": 2.21875, "grad_norm_var": 0.015317535400390625, "learning_rate": 0.0001, "loss": 4.4183, "loss/crossentropy": 2.35861599445343, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26162558794021606, "step": 9550 }, { "epoch": 0.19104, "grad_norm": 2.015625, "grad_norm_var": 0.014388020833333333, "learning_rate": 0.0001, "loss": 4.2507, "loss/crossentropy": 1.9529705047607422, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2125585675239563, "step": 9552 }, { "epoch": 0.19108, "grad_norm": 2.109375, "grad_norm_var": 0.011844889322916666, "learning_rate": 0.0001, "loss": 4.2041, "loss/crossentropy": 2.0364453196525574, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21163037419319153, "step": 9554 }, { "epoch": 0.19112, "grad_norm": 2.078125, "grad_norm_var": 0.009749348958333333, "learning_rate": 0.0001, "loss": 4.2758, "loss/crossentropy": 2.1321409940719604, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24250106513500214, "step": 9556 }, { "epoch": 0.19116, "grad_norm": 2.125, "grad_norm_var": 0.0073931376139322914, "learning_rate": 0.0001, "loss": 4.1848, "loss/crossentropy": 2.024011969566345, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20132286846637726, "step": 9558 }, { "epoch": 0.1912, "grad_norm": 2.109375, "grad_norm_var": 0.006980133056640625, "learning_rate": 0.0001, "loss": 4.3726, "loss/crossentropy": 2.106776535511017, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23114337772130966, "step": 9560 }, { "epoch": 0.19124, "grad_norm": 2.265625, "grad_norm_var": 0.007645416259765625, "learning_rate": 0.0001, "loss": 4.4642, "loss/crossentropy": 1.9228236079216003, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22426098585128784, "step": 9562 }, { "epoch": 0.19128, "grad_norm": 2.171875, "grad_norm_var": 0.008090972900390625, "learning_rate": 0.0001, "loss": 4.2384, "loss/crossentropy": 2.3033370971679688, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20113499462604523, "step": 9564 }, { "epoch": 0.19132, "grad_norm": 2.125, "grad_norm_var": 0.008973948160807292, "learning_rate": 0.0001, "loss": 4.0736, "loss/crossentropy": 1.6983963251113892, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19003060460090637, "step": 9566 }, { "epoch": 0.19136, "grad_norm": 2.34375, "grad_norm_var": 0.011295318603515625, "learning_rate": 0.0001, "loss": 4.5283, "loss/crossentropy": 2.502004861831665, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24949797987937927, "step": 9568 }, { "epoch": 0.1914, "grad_norm": 2.03125, "grad_norm_var": 0.011793772379557291, "learning_rate": 0.0001, "loss": 4.2004, "loss/crossentropy": 1.9981504678726196, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2152494639158249, "step": 9570 }, { "epoch": 0.19144, "grad_norm": 1.96875, "grad_norm_var": 0.012931060791015626, "learning_rate": 0.0001, "loss": 4.1211, "loss/crossentropy": 2.1489784717559814, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22105325013399124, "step": 9572 }, { "epoch": 0.19148, "grad_norm": 1.9453125, "grad_norm_var": 0.013016510009765624, "learning_rate": 0.0001, "loss": 4.2234, "loss/crossentropy": 1.974421203136444, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23960395902395248, "step": 9574 }, { "epoch": 0.19152, "grad_norm": 2.09375, "grad_norm_var": 0.012670644124348958, "learning_rate": 0.0001, "loss": 4.4119, "loss/crossentropy": 2.379599928855896, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26731471717357635, "step": 9576 }, { "epoch": 0.19156, "grad_norm": 2.015625, "grad_norm_var": 0.012444814046223959, "learning_rate": 0.0001, "loss": 4.3718, "loss/crossentropy": 2.26211154460907, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22957566380500793, "step": 9578 }, { "epoch": 0.1916, "grad_norm": 2.125, "grad_norm_var": 0.011871083577473959, "learning_rate": 0.0001, "loss": 4.4594, "loss/crossentropy": 2.282869577407837, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2404445931315422, "step": 9580 }, { "epoch": 0.19164, "grad_norm": 2.03125, "grad_norm_var": 0.010457102457682292, "learning_rate": 0.0001, "loss": 4.1592, "loss/crossentropy": 1.9183810949325562, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18672315031290054, "step": 9582 }, { "epoch": 0.19168, "grad_norm": 2.046875, "grad_norm_var": 0.0076812744140625, "learning_rate": 0.0001, "loss": 4.1801, "loss/crossentropy": 2.2722173929214478, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23262400180101395, "step": 9584 }, { "epoch": 0.19172, "grad_norm": 2.0625, "grad_norm_var": 0.0075266520182291664, "learning_rate": 0.0001, "loss": 4.4032, "loss/crossentropy": 2.426178455352783, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23461100459098816, "step": 9586 }, { "epoch": 0.19176, "grad_norm": 2.09375, "grad_norm_var": 0.00687255859375, "learning_rate": 0.0001, "loss": 4.344, "loss/crossentropy": 2.2266165018081665, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22530251741409302, "step": 9588 }, { "epoch": 0.1918, "grad_norm": 2.078125, "grad_norm_var": 0.005541737874348958, "learning_rate": 0.0001, "loss": 4.1633, "loss/crossentropy": 1.786275327205658, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19257958233356476, "step": 9590 }, { "epoch": 0.19184, "grad_norm": 2.0, "grad_norm_var": 0.008868153889973958, "learning_rate": 0.0001, "loss": 4.2367, "loss/crossentropy": 1.8497431874275208, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21965742111206055, "step": 9592 }, { "epoch": 0.19188, "grad_norm": 2.078125, "grad_norm_var": 0.008143870035807292, "learning_rate": 0.0001, "loss": 4.5612, "loss/crossentropy": 2.492846131324768, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21851783990859985, "step": 9594 }, { "epoch": 0.19192, "grad_norm": 2.28125, "grad_norm_var": 0.010027821858723958, "learning_rate": 0.0001, "loss": 4.6838, "loss/crossentropy": 2.3447986841201782, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26567359268665314, "step": 9596 }, { "epoch": 0.19196, "grad_norm": 1.984375, "grad_norm_var": 0.010654449462890625, "learning_rate": 0.0001, "loss": 4.1513, "loss/crossentropy": 1.656063199043274, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1903422325849533, "step": 9598 }, { "epoch": 0.192, "grad_norm": 2.125, "grad_norm_var": 0.008329264322916667, "learning_rate": 0.0001, "loss": 4.3252, "loss/crossentropy": 2.1816134452819824, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23314762860536575, "step": 9600 }, { "epoch": 0.19204, "grad_norm": 2.015625, "grad_norm_var": 0.010007476806640625, "learning_rate": 0.0001, "loss": 3.8537, "loss/crossentropy": 1.787261426448822, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20938758552074432, "step": 9602 }, { "epoch": 0.19208, "grad_norm": 2.015625, "grad_norm_var": 0.010526275634765625, "learning_rate": 0.0001, "loss": 4.2374, "loss/crossentropy": 1.9722678065299988, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23085469752550125, "step": 9604 }, { "epoch": 0.19212, "grad_norm": 1.9453125, "grad_norm_var": 0.012113444010416667, "learning_rate": 0.0001, "loss": 4.0954, "loss/crossentropy": 2.04559987783432, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21647220849990845, "step": 9606 }, { "epoch": 0.19216, "grad_norm": 2.140625, "grad_norm_var": 0.008934529622395833, "learning_rate": 0.0001, "loss": 4.5397, "loss/crossentropy": 2.5426105260849, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23680832237005234, "step": 9608 }, { "epoch": 0.1922, "grad_norm": 2.0, "grad_norm_var": 0.008890787760416666, "learning_rate": 0.0001, "loss": 4.245, "loss/crossentropy": 2.1339075565338135, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22501112520694733, "step": 9610 }, { "epoch": 0.19224, "grad_norm": 2.03125, "grad_norm_var": 0.006550852457682292, "learning_rate": 0.0001, "loss": 4.0924, "loss/crossentropy": 1.9554831981658936, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20187357813119888, "step": 9612 }, { "epoch": 0.19228, "grad_norm": 2.203125, "grad_norm_var": 0.009065500895182292, "learning_rate": 0.0001, "loss": 4.5578, "loss/crossentropy": 2.2996249198913574, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24954287707805634, "step": 9614 }, { "epoch": 0.19232, "grad_norm": 2.125, "grad_norm_var": 0.009065500895182292, "learning_rate": 0.0001, "loss": 4.3753, "loss/crossentropy": 2.2439414262771606, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22833245247602463, "step": 9616 }, { "epoch": 0.19236, "grad_norm": 2.046875, "grad_norm_var": 0.007933553059895833, "learning_rate": 0.0001, "loss": 4.2297, "loss/crossentropy": 1.8900890946388245, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21077623218297958, "step": 9618 }, { "epoch": 0.1924, "grad_norm": 2.03125, "grad_norm_var": 0.007804361979166666, "learning_rate": 0.0001, "loss": 4.287, "loss/crossentropy": 2.110726058483124, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21897974610328674, "step": 9620 }, { "epoch": 0.19244, "grad_norm": 1.9765625, "grad_norm_var": 0.0080474853515625, "learning_rate": 0.0001, "loss": 4.0135, "loss/crossentropy": 1.7363090515136719, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19939633458852768, "step": 9622 }, { "epoch": 0.19248, "grad_norm": 2.109375, "grad_norm_var": 0.01014404296875, "learning_rate": 0.0001, "loss": 4.3643, "loss/crossentropy": 1.8148014545440674, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21637701243162155, "step": 9624 }, { "epoch": 0.19252, "grad_norm": 2.03125, "grad_norm_var": 0.0092041015625, "learning_rate": 0.0001, "loss": 4.2451, "loss/crossentropy": 2.2895134687423706, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2297188639640808, "step": 9626 }, { "epoch": 0.19256, "grad_norm": 2.15625, "grad_norm_var": 0.006296539306640625, "learning_rate": 0.0001, "loss": 4.4778, "loss/crossentropy": 2.117924213409424, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24057473242282867, "step": 9628 }, { "epoch": 0.1926, "grad_norm": 2.140625, "grad_norm_var": 0.005421702067057292, "learning_rate": 0.0001, "loss": 4.2791, "loss/crossentropy": 1.8709319829940796, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2106107696890831, "step": 9630 }, { "epoch": 0.19264, "grad_norm": 1.9140625, "grad_norm_var": 0.0079742431640625, "learning_rate": 0.0001, "loss": 4.0174, "loss/crossentropy": 1.5156871676445007, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17487400770187378, "step": 9632 }, { "epoch": 0.19268, "grad_norm": 2.1875, "grad_norm_var": 0.0183837890625, "learning_rate": 0.0001, "loss": 4.6281, "loss/crossentropy": 2.153126537799835, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22009101510047913, "step": 9634 }, { "epoch": 0.19272, "grad_norm": 2.09375, "grad_norm_var": 0.017853800455729166, "learning_rate": 0.0001, "loss": 4.443, "loss/crossentropy": 2.0890414714813232, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2233077436685562, "step": 9636 }, { "epoch": 0.19276, "grad_norm": 2.015625, "grad_norm_var": 0.01858495076497396, "learning_rate": 0.0001, "loss": 4.2572, "loss/crossentropy": 2.1218496561050415, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22576671838760376, "step": 9638 }, { "epoch": 0.1928, "grad_norm": 1.921875, "grad_norm_var": 0.020918528238932293, "learning_rate": 0.0001, "loss": 4.2522, "loss/crossentropy": 2.1131649017333984, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21846124529838562, "step": 9640 }, { "epoch": 0.19284, "grad_norm": 2.1875, "grad_norm_var": 0.02067845662434896, "learning_rate": 0.0001, "loss": 4.6643, "loss/crossentropy": 2.3941714763641357, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.246211439371109, "step": 9642 }, { "epoch": 0.19288, "grad_norm": 1.890625, "grad_norm_var": 0.02687352498372396, "learning_rate": 0.0001, "loss": 4.0641, "loss/crossentropy": 1.9579638838768005, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.198220357298851, "step": 9644 }, { "epoch": 0.19292, "grad_norm": 2.140625, "grad_norm_var": 0.02754491170247396, "learning_rate": 0.0001, "loss": 4.564, "loss/crossentropy": 2.388027787208557, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23349716514348984, "step": 9646 }, { "epoch": 0.19296, "grad_norm": 2.15625, "grad_norm_var": 0.02520726521809896, "learning_rate": 0.0001, "loss": 4.3678, "loss/crossentropy": 1.8203087449073792, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18923642486333847, "step": 9648 }, { "epoch": 0.193, "grad_norm": 2.03125, "grad_norm_var": 0.015476226806640625, "learning_rate": 0.0001, "loss": 4.2119, "loss/crossentropy": 1.996269702911377, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2096453383564949, "step": 9650 }, { "epoch": 0.19304, "grad_norm": 2.09375, "grad_norm_var": 0.01572240193684896, "learning_rate": 0.0001, "loss": 4.2921, "loss/crossentropy": 1.806606113910675, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19824761897325516, "step": 9652 }, { "epoch": 0.19308, "grad_norm": 2.28125, "grad_norm_var": 0.01651178995768229, "learning_rate": 0.0001, "loss": 4.1681, "loss/crossentropy": 2.190830111503601, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22319861501455307, "step": 9654 }, { "epoch": 0.19312, "grad_norm": 2.15625, "grad_norm_var": 0.014048004150390625, "learning_rate": 0.0001, "loss": 4.4579, "loss/crossentropy": 1.9721493124961853, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2011292800307274, "step": 9656 }, { "epoch": 0.19316, "grad_norm": 2.234375, "grad_norm_var": 0.014277903238932292, "learning_rate": 0.0001, "loss": 4.5673, "loss/crossentropy": 2.1256929636001587, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2728194147348404, "step": 9658 }, { "epoch": 0.1932, "grad_norm": 2.125, "grad_norm_var": 0.008345286051432291, "learning_rate": 0.0001, "loss": 4.3206, "loss/crossentropy": 2.090156316757202, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2240355908870697, "step": 9660 }, { "epoch": 0.19324, "grad_norm": 1.9140625, "grad_norm_var": 0.0108795166015625, "learning_rate": 0.0001, "loss": 4.0231, "loss/crossentropy": 2.2930272817611694, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23188824206590652, "step": 9662 }, { "epoch": 0.19328, "grad_norm": 1.9453125, "grad_norm_var": 0.0122222900390625, "learning_rate": 0.0001, "loss": 4.0075, "loss/crossentropy": 1.9754068851470947, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18698863685131073, "step": 9664 }, { "epoch": 0.19332, "grad_norm": 2.09375, "grad_norm_var": 0.0118560791015625, "learning_rate": 0.0001, "loss": 4.1668, "loss/crossentropy": 1.8987788558006287, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22531752288341522, "step": 9666 }, { "epoch": 0.19336, "grad_norm": 2.0625, "grad_norm_var": 0.011678059895833334, "learning_rate": 0.0001, "loss": 4.3185, "loss/crossentropy": 2.2699583768844604, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2572309076786041, "step": 9668 }, { "epoch": 0.1934, "grad_norm": 2.0625, "grad_norm_var": 0.0072509765625, "learning_rate": 0.0001, "loss": 4.3267, "loss/crossentropy": 1.6649349927902222, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23388104140758514, "step": 9670 }, { "epoch": 0.19344, "grad_norm": 2.171875, "grad_norm_var": 0.007258097330729167, "learning_rate": 0.0001, "loss": 4.3846, "loss/crossentropy": 2.173617362976074, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20848755538463593, "step": 9672 }, { "epoch": 0.19348, "grad_norm": 2.03125, "grad_norm_var": 0.005890909830729167, "learning_rate": 0.0001, "loss": 4.3252, "loss/crossentropy": 2.2690787315368652, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23493453860282898, "step": 9674 }, { "epoch": 0.19352, "grad_norm": 1.9375, "grad_norm_var": 0.007079060872395833, "learning_rate": 0.0001, "loss": 4.3621, "loss/crossentropy": 2.00560861825943, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22538188099861145, "step": 9676 }, { "epoch": 0.19356, "grad_norm": 2.0625, "grad_norm_var": 0.004369862874348958, "learning_rate": 0.0001, "loss": 4.1264, "loss/crossentropy": 1.960120975971222, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21056914329528809, "step": 9678 }, { "epoch": 0.1936, "grad_norm": 1.9921875, "grad_norm_var": 0.004689280192057292, "learning_rate": 0.0001, "loss": 3.9868, "loss/crossentropy": 1.8921862840652466, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22250327467918396, "step": 9680 }, { "epoch": 0.19364, "grad_norm": 2.0625, "grad_norm_var": 0.004839833577473958, "learning_rate": 0.0001, "loss": 4.296, "loss/crossentropy": 1.9474233984947205, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19719959795475006, "step": 9682 }, { "epoch": 0.19368, "grad_norm": 2.21875, "grad_norm_var": 0.0072100321451822914, "learning_rate": 0.0001, "loss": 4.2985, "loss/crossentropy": 2.3391844034194946, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22263485193252563, "step": 9684 }, { "epoch": 0.19372, "grad_norm": 2.21875, "grad_norm_var": 0.008715565999348958, "learning_rate": 0.0001, "loss": 4.3641, "loss/crossentropy": 2.190012037754059, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22531607002019882, "step": 9686 }, { "epoch": 0.19376, "grad_norm": 2.15625, "grad_norm_var": 0.008283487955729167, "learning_rate": 0.0001, "loss": 4.2601, "loss/crossentropy": 1.9935640096664429, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21438511461019516, "step": 9688 }, { "epoch": 0.1938, "grad_norm": 2.15625, "grad_norm_var": 0.009186808268229167, "learning_rate": 0.0001, "loss": 4.1756, "loss/crossentropy": 1.7482191324234009, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20972990244627, "step": 9690 }, { "epoch": 0.19384, "grad_norm": 2.125, "grad_norm_var": 0.008353678385416667, "learning_rate": 0.0001, "loss": 4.475, "loss/crossentropy": 2.2413275241851807, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24103231728076935, "step": 9692 }, { "epoch": 0.19388, "grad_norm": 2.140625, "grad_norm_var": 0.008519490559895834, "learning_rate": 0.0001, "loss": 4.2886, "loss/crossentropy": 2.2846572399139404, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2420315518975258, "step": 9694 }, { "epoch": 0.19392, "grad_norm": 2.265625, "grad_norm_var": 0.007968902587890625, "learning_rate": 0.0001, "loss": 4.1665, "loss/crossentropy": 1.7977504134178162, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20145538449287415, "step": 9696 }, { "epoch": 0.19396, "grad_norm": 2.0625, "grad_norm_var": 0.007535552978515625, "learning_rate": 0.0001, "loss": 4.205, "loss/crossentropy": 2.0343902111053467, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21595563739538193, "step": 9698 }, { "epoch": 0.194, "grad_norm": 2.125, "grad_norm_var": 0.0052487691243489586, "learning_rate": 0.0001, "loss": 3.9524, "loss/crossentropy": 1.8828233480453491, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19624938070774078, "step": 9700 }, { "epoch": 0.19404, "grad_norm": 1.984375, "grad_norm_var": 0.005602773030598958, "learning_rate": 0.0001, "loss": 4.1569, "loss/crossentropy": 1.9177573323249817, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19467243552207947, "step": 9702 }, { "epoch": 0.19408, "grad_norm": 1.9296875, "grad_norm_var": 0.006613922119140625, "learning_rate": 0.0001, "loss": 3.8881, "loss/crossentropy": 1.8025588393211365, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18757501989603043, "step": 9704 }, { "epoch": 0.19412, "grad_norm": 2.140625, "grad_norm_var": 0.006723785400390625, "learning_rate": 0.0001, "loss": 4.3195, "loss/crossentropy": 2.025633454322815, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21623078733682632, "step": 9706 }, { "epoch": 0.19416, "grad_norm": 2.109375, "grad_norm_var": 0.009129842122395834, "learning_rate": 0.0001, "loss": 3.8244, "loss/crossentropy": 1.9421055316925049, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20563311874866486, "step": 9708 }, { "epoch": 0.1942, "grad_norm": 2.09375, "grad_norm_var": 0.009191640218098958, "learning_rate": 0.0001, "loss": 4.0977, "loss/crossentropy": 1.9820671081542969, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22029083967208862, "step": 9710 }, { "epoch": 0.19424, "grad_norm": 2.328125, "grad_norm_var": 0.010341135660807292, "learning_rate": 0.0001, "loss": 4.2037, "loss/crossentropy": 1.9491158723831177, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19559209793806076, "step": 9712 }, { "epoch": 0.19428, "grad_norm": 2.0625, "grad_norm_var": 0.010416412353515625, "learning_rate": 0.0001, "loss": 4.1592, "loss/crossentropy": 1.8653306365013123, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18629660457372665, "step": 9714 }, { "epoch": 0.19432, "grad_norm": 1.953125, "grad_norm_var": 0.01950251261393229, "learning_rate": 0.0001, "loss": 4.2567, "loss/crossentropy": 1.6897491812705994, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1948539912700653, "step": 9716 }, { "epoch": 0.19436, "grad_norm": 2.203125, "grad_norm_var": 0.020776112874348957, "learning_rate": 0.0001, "loss": 4.4387, "loss/crossentropy": 2.07690966129303, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2147115021944046, "step": 9718 }, { "epoch": 0.1944, "grad_norm": 2.03125, "grad_norm_var": 0.020189412434895835, "learning_rate": 0.0001, "loss": 4.269, "loss/crossentropy": 2.0356597304344177, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20626354217529297, "step": 9720 }, { "epoch": 0.19444, "grad_norm": 2.0625, "grad_norm_var": 0.0204254150390625, "learning_rate": 0.0001, "loss": 4.0548, "loss/crossentropy": 1.7709991931915283, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18573015183210373, "step": 9722 }, { "epoch": 0.19448, "grad_norm": 1.984375, "grad_norm_var": 0.01830012003580729, "learning_rate": 0.0001, "loss": 4.117, "loss/crossentropy": 2.0255925059318542, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22114800661802292, "step": 9724 }, { "epoch": 0.19452, "grad_norm": 2.25, "grad_norm_var": 0.01871337890625, "learning_rate": 0.0001, "loss": 4.4655, "loss/crossentropy": 2.0046772956848145, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22649522870779037, "step": 9726 }, { "epoch": 0.19456, "grad_norm": 2.15625, "grad_norm_var": 0.015827433268229166, "learning_rate": 0.0001, "loss": 4.179, "loss/crossentropy": 2.2746634483337402, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23017344623804092, "step": 9728 }, { "epoch": 0.1946, "grad_norm": 2.078125, "grad_norm_var": 0.015184529622395833, "learning_rate": 0.0001, "loss": 4.203, "loss/crossentropy": 2.0191025137901306, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20670472085475922, "step": 9730 }, { "epoch": 0.19464, "grad_norm": 2.015625, "grad_norm_var": 0.008177693684895833, "learning_rate": 0.0001, "loss": 4.1657, "loss/crossentropy": 2.3198455572128296, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24985665082931519, "step": 9732 }, { "epoch": 0.19468, "grad_norm": 2.09375, "grad_norm_var": 0.0073150634765625, "learning_rate": 0.0001, "loss": 4.3886, "loss/crossentropy": 2.5229711532592773, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22920701652765274, "step": 9734 }, { "epoch": 0.19472, "grad_norm": 2.078125, "grad_norm_var": 0.0072662353515625, "learning_rate": 0.0001, "loss": 3.9991, "loss/crossentropy": 2.081319808959961, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21542686223983765, "step": 9736 }, { "epoch": 0.19476, "grad_norm": 2.046875, "grad_norm_var": 0.008373006184895834, "learning_rate": 0.0001, "loss": 4.0698, "loss/crossentropy": 2.2170007824897766, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21852095425128937, "step": 9738 }, { "epoch": 0.1948, "grad_norm": 1.9453125, "grad_norm_var": 0.008265940348307292, "learning_rate": 0.0001, "loss": 4.1528, "loss/crossentropy": 1.830683708190918, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1910252571105957, "step": 9740 }, { "epoch": 0.19484, "grad_norm": 2.25, "grad_norm_var": 0.008420562744140625, "learning_rate": 0.0001, "loss": 4.3543, "loss/crossentropy": 2.1303864121437073, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.228670135140419, "step": 9742 }, { "epoch": 0.19488, "grad_norm": 2.015625, "grad_norm_var": 0.007486724853515625, "learning_rate": 0.0001, "loss": 4.1504, "loss/crossentropy": 2.2621915340423584, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22027206420898438, "step": 9744 }, { "epoch": 0.19492, "grad_norm": 2.0, "grad_norm_var": 0.0110260009765625, "learning_rate": 0.0001, "loss": 3.9564, "loss/crossentropy": 2.044555902481079, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20469766855239868, "step": 9746 }, { "epoch": 0.19496, "grad_norm": 2.109375, "grad_norm_var": 0.0115142822265625, "learning_rate": 0.0001, "loss": 4.1677, "loss/crossentropy": 1.911176860332489, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22036674618721008, "step": 9748 }, { "epoch": 0.195, "grad_norm": 1.96875, "grad_norm_var": 0.010545857747395833, "learning_rate": 0.0001, "loss": 4.0904, "loss/crossentropy": 1.8650219440460205, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19878911972045898, "step": 9750 }, { "epoch": 0.19504, "grad_norm": 2.125, "grad_norm_var": 0.02088623046875, "learning_rate": 0.0001, "loss": 4.3648, "loss/crossentropy": 2.196391463279724, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20845064520835876, "step": 9752 }, { "epoch": 0.19508, "grad_norm": 1.9921875, "grad_norm_var": 0.021201324462890626, "learning_rate": 0.0001, "loss": 4.0132, "loss/crossentropy": 2.222484588623047, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2144618257880211, "step": 9754 }, { "epoch": 0.19512, "grad_norm": 2.15625, "grad_norm_var": 0.020685831705729168, "learning_rate": 0.0001, "loss": 4.4078, "loss/crossentropy": 2.416514754295349, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2370949387550354, "step": 9756 }, { "epoch": 0.19516, "grad_norm": 1.9765625, "grad_norm_var": 0.020566558837890624, "learning_rate": 0.0001, "loss": 4.0119, "loss/crossentropy": 2.22190260887146, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23128122836351395, "step": 9758 }, { "epoch": 0.1952, "grad_norm": 2.15625, "grad_norm_var": 0.024008941650390626, "learning_rate": 0.0001, "loss": 4.4253, "loss/crossentropy": 2.446492910385132, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23962965607643127, "step": 9760 }, { "epoch": 0.19524, "grad_norm": 2.015625, "grad_norm_var": 0.019245402018229166, "learning_rate": 0.0001, "loss": 4.1676, "loss/crossentropy": 2.1458136439323425, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19657295942306519, "step": 9762 }, { "epoch": 0.19528, "grad_norm": 2.203125, "grad_norm_var": 0.0197418212890625, "learning_rate": 0.0001, "loss": 4.6029, "loss/crossentropy": 1.9773340225219727, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22887050360441208, "step": 9764 }, { "epoch": 0.19532, "grad_norm": 1.9453125, "grad_norm_var": 0.020334625244140626, "learning_rate": 0.0001, "loss": 4.1583, "loss/crossentropy": 2.0938061475753784, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19823112338781357, "step": 9766 }, { "epoch": 0.19536, "grad_norm": 2.3125, "grad_norm_var": 0.015636952718098958, "learning_rate": 0.0001, "loss": 4.2628, "loss/crossentropy": 1.9068174958229065, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19924984872341156, "step": 9768 }, { "epoch": 0.1954, "grad_norm": 2.109375, "grad_norm_var": 0.012422688802083333, "learning_rate": 0.0001, "loss": 4.3567, "loss/crossentropy": 2.045413613319397, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19175175577402115, "step": 9770 }, { "epoch": 0.19544, "grad_norm": 2.09375, "grad_norm_var": 0.013106282552083333, "learning_rate": 0.0001, "loss": 4.1972, "loss/crossentropy": 2.0262961983680725, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22137057781219482, "step": 9772 }, { "epoch": 0.19548, "grad_norm": 2.0625, "grad_norm_var": 0.012992350260416667, "learning_rate": 0.0001, "loss": 4.0624, "loss/crossentropy": 2.146475672721863, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2284562587738037, "step": 9774 }, { "epoch": 0.19552, "grad_norm": 2.015625, "grad_norm_var": 0.011885579427083333, "learning_rate": 0.0001, "loss": 4.154, "loss/crossentropy": 2.0316100120544434, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26275022327899933, "step": 9776 }, { "epoch": 0.19556, "grad_norm": 2.140625, "grad_norm_var": 0.011359659830729167, "learning_rate": 0.0001, "loss": 4.5424, "loss/crossentropy": 2.1386696100234985, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22695952653884888, "step": 9778 }, { "epoch": 0.1956, "grad_norm": 2.578125, "grad_norm_var": 0.024405924479166667, "learning_rate": 0.0001, "loss": 4.2391, "loss/crossentropy": 2.3000820875167847, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24045731872320175, "step": 9780 }, { "epoch": 0.19564, "grad_norm": 2.3125, "grad_norm_var": 0.02415949503580729, "learning_rate": 0.0001, "loss": 4.7778, "loss/crossentropy": 2.1272310614585876, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2318389192223549, "step": 9782 }, { "epoch": 0.19568, "grad_norm": 1.9453125, "grad_norm_var": 0.025655110677083332, "learning_rate": 0.0001, "loss": 4.1232, "loss/crossentropy": 1.8953965306282043, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.213637076318264, "step": 9784 }, { "epoch": 0.19572, "grad_norm": 2.0625, "grad_norm_var": 0.0261871337890625, "learning_rate": 0.0001, "loss": 4.3459, "loss/crossentropy": 2.0738734006881714, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2100028172135353, "step": 9786 }, { "epoch": 0.19576, "grad_norm": 2.0, "grad_norm_var": 0.027378082275390625, "learning_rate": 0.0001, "loss": 4.2599, "loss/crossentropy": 1.9454593658447266, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18557409197092056, "step": 9788 }, { "epoch": 0.1958, "grad_norm": 2.015625, "grad_norm_var": 0.026569620768229166, "learning_rate": 0.0001, "loss": 4.1655, "loss/crossentropy": 2.101949095726013, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22479471564292908, "step": 9790 }, { "epoch": 0.19584, "grad_norm": 2.25, "grad_norm_var": 0.025935872395833334, "learning_rate": 0.0001, "loss": 4.3841, "loss/crossentropy": 1.9524416327476501, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20574256777763367, "step": 9792 }, { "epoch": 0.19588, "grad_norm": 2.296875, "grad_norm_var": 0.06084391276041667, "learning_rate": 0.0001, "loss": 4.1596, "loss/crossentropy": 1.9490719437599182, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19411193579435349, "step": 9794 }, { "epoch": 0.19592, "grad_norm": 2.046875, "grad_norm_var": 0.050675455729166666, "learning_rate": 0.0001, "loss": 4.3438, "loss/crossentropy": 2.095793664455414, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21078093349933624, "step": 9796 }, { "epoch": 0.19596, "grad_norm": 2.015625, "grad_norm_var": 0.0508056640625, "learning_rate": 0.0001, "loss": 4.0828, "loss/crossentropy": 2.032066822052002, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21181444078683853, "step": 9798 }, { "epoch": 0.196, "grad_norm": 2.140625, "grad_norm_var": 0.048378245035807295, "learning_rate": 0.0001, "loss": 4.0019, "loss/crossentropy": 2.1378380060195923, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.224056214094162, "step": 9800 }, { "epoch": 0.19604, "grad_norm": 2.1875, "grad_norm_var": 0.04784520467122396, "learning_rate": 0.0001, "loss": 4.2932, "loss/crossentropy": 2.120614767074585, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2104548141360283, "step": 9802 }, { "epoch": 0.19608, "grad_norm": 1.8984375, "grad_norm_var": 0.048954010009765625, "learning_rate": 0.0001, "loss": 4.1975, "loss/crossentropy": 1.8837141394615173, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19755827635526657, "step": 9804 }, { "epoch": 0.19612, "grad_norm": 2.03125, "grad_norm_var": 0.04793675740559896, "learning_rate": 0.0001, "loss": 4.1645, "loss/crossentropy": 1.9325945973396301, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2005266472697258, "step": 9806 }, { "epoch": 0.19616, "grad_norm": 2.046875, "grad_norm_var": 0.04839045206705729, "learning_rate": 0.0001, "loss": 4.2211, "loss/crossentropy": 1.789370834827423, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20060084015130997, "step": 9808 }, { "epoch": 0.1962, "grad_norm": 1.96875, "grad_norm_var": 0.01709162394205729, "learning_rate": 0.0001, "loss": 4.2463, "loss/crossentropy": 1.9807876348495483, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21805762499570847, "step": 9810 }, { "epoch": 0.19624, "grad_norm": 1.9375, "grad_norm_var": 0.018888092041015624, "learning_rate": 0.0001, "loss": 4.1339, "loss/crossentropy": 2.210070848464966, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23082506656646729, "step": 9812 }, { "epoch": 0.19628, "grad_norm": 2.0625, "grad_norm_var": 0.01693903605143229, "learning_rate": 0.0001, "loss": 4.1633, "loss/crossentropy": 1.8644117712974548, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20300551503896713, "step": 9814 }, { "epoch": 0.19632, "grad_norm": 2.125, "grad_norm_var": 0.01634496053059896, "learning_rate": 0.0001, "loss": 4.2064, "loss/crossentropy": 1.6804233193397522, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20438820868730545, "step": 9816 }, { "epoch": 0.19636, "grad_norm": 2.09375, "grad_norm_var": 0.01757990519205729, "learning_rate": 0.0001, "loss": 4.4339, "loss/crossentropy": 2.2513452768325806, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24517202377319336, "step": 9818 }, { "epoch": 0.1964, "grad_norm": 2.203125, "grad_norm_var": 0.01683527628580729, "learning_rate": 0.0001, "loss": 4.2238, "loss/crossentropy": 2.3133161067962646, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22146832942962646, "step": 9820 }, { "epoch": 0.19644, "grad_norm": 2.1875, "grad_norm_var": 0.016949208577473958, "learning_rate": 0.0001, "loss": 4.5003, "loss/crossentropy": 2.3226611614227295, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2327018678188324, "step": 9822 }, { "epoch": 0.19648, "grad_norm": 2.078125, "grad_norm_var": 0.016228993733723957, "learning_rate": 0.0001, "loss": 4.1832, "loss/crossentropy": 2.234626054763794, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23657850921154022, "step": 9824 }, { "epoch": 0.19652, "grad_norm": 2.25, "grad_norm_var": 0.008941396077473959, "learning_rate": 0.0001, "loss": 4.4576, "loss/crossentropy": 2.3478844165802, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24796272069215775, "step": 9826 }, { "epoch": 0.19656, "grad_norm": 2.171875, "grad_norm_var": 0.006982167561848958, "learning_rate": 0.0001, "loss": 4.3678, "loss/crossentropy": 2.094748795032501, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21118396520614624, "step": 9828 }, { "epoch": 0.1966, "grad_norm": 2.140625, "grad_norm_var": 0.007289377848307291, "learning_rate": 0.0001, "loss": 4.2916, "loss/crossentropy": 2.166300058364868, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21773284673690796, "step": 9830 }, { "epoch": 0.19664, "grad_norm": 2.09375, "grad_norm_var": 0.007212066650390625, "learning_rate": 0.0001, "loss": 4.5253, "loss/crossentropy": 2.198317289352417, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22636590898036957, "step": 9832 }, { "epoch": 0.19668, "grad_norm": 2.1875, "grad_norm_var": 0.006278228759765625, "learning_rate": 0.0001, "loss": 4.3405, "loss/crossentropy": 2.3081597089767456, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.231675922870636, "step": 9834 }, { "epoch": 0.19672, "grad_norm": 1.9921875, "grad_norm_var": 0.005206044514973958, "learning_rate": 0.0001, "loss": 4.1181, "loss/crossentropy": 2.048017203807831, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20089948922395706, "step": 9836 }, { "epoch": 0.19676, "grad_norm": 2.125, "grad_norm_var": 0.01768773396809896, "learning_rate": 0.0001, "loss": 4.0515, "loss/crossentropy": 1.886117160320282, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21702590584754944, "step": 9838 }, { "epoch": 0.1968, "grad_norm": 2.0625, "grad_norm_var": 0.017895253499348958, "learning_rate": 0.0001, "loss": 3.9827, "loss/crossentropy": 1.902605414390564, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19976041465997696, "step": 9840 }, { "epoch": 0.19684, "grad_norm": 4.28125, "grad_norm_var": 0.309179433186849, "learning_rate": 0.0001, "loss": 4.0874, "loss/crossentropy": 1.7959995865821838, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2023170217871666, "step": 9842 }, { "epoch": 0.19688, "grad_norm": 2.15625, "grad_norm_var": 0.3066993713378906, "learning_rate": 0.0001, "loss": 4.5535, "loss/crossentropy": 2.1481738090515137, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22133169323205948, "step": 9844 }, { "epoch": 0.19692, "grad_norm": 2.03125, "grad_norm_var": 0.30752741495768227, "learning_rate": 0.0001, "loss": 4.0269, "loss/crossentropy": 1.9328197240829468, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20346572250127792, "step": 9846 }, { "epoch": 0.19696, "grad_norm": 2.015625, "grad_norm_var": 0.31202367146809895, "learning_rate": 0.0001, "loss": 4.0299, "loss/crossentropy": 2.096457004547119, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21341010928153992, "step": 9848 }, { "epoch": 0.197, "grad_norm": 2.03125, "grad_norm_var": 0.3146522521972656, "learning_rate": 0.0001, "loss": 4.3105, "loss/crossentropy": 1.9698969721794128, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1855439990758896, "step": 9850 }, { "epoch": 0.19704, "grad_norm": 2.234375, "grad_norm_var": 0.31038004557291665, "learning_rate": 0.0001, "loss": 4.2165, "loss/crossentropy": 1.987346351146698, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23657751083374023, "step": 9852 }, { "epoch": 0.19708, "grad_norm": 1.8984375, "grad_norm_var": 0.3109169006347656, "learning_rate": 0.0001, "loss": 4.0346, "loss/crossentropy": 1.9535572528839111, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21131790429353714, "step": 9854 }, { "epoch": 0.19712, "grad_norm": 2.09375, "grad_norm_var": 0.3090349833170573, "learning_rate": 0.0001, "loss": 4.5828, "loss/crossentropy": 2.08541601896286, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2290833741426468, "step": 9856 }, { "epoch": 0.19716, "grad_norm": 2.28125, "grad_norm_var": 0.016155751546223958, "learning_rate": 0.0001, "loss": 4.2774, "loss/crossentropy": 2.1930192708969116, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23072391748428345, "step": 9858 }, { "epoch": 0.1972, "grad_norm": 2.359375, "grad_norm_var": 0.016658274332682292, "learning_rate": 0.0001, "loss": 4.3923, "loss/crossentropy": 1.8369358777999878, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20687467604875565, "step": 9860 }, { "epoch": 0.19724, "grad_norm": 2.0, "grad_norm_var": 0.017380523681640624, "learning_rate": 0.0001, "loss": 4.0287, "loss/crossentropy": 1.6707186102867126, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1808091625571251, "step": 9862 }, { "epoch": 0.19728, "grad_norm": 1.984375, "grad_norm_var": 0.017651112874348958, "learning_rate": 0.0001, "loss": 4.1184, "loss/crossentropy": 1.8352991342544556, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19169726222753525, "step": 9864 }, { "epoch": 0.19732, "grad_norm": 2.734375, "grad_norm_var": 0.042909495035807294, "learning_rate": 0.0001, "loss": 4.2857, "loss/crossentropy": 1.9427489638328552, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20076656341552734, "step": 9866 }, { "epoch": 0.19736, "grad_norm": 2.078125, "grad_norm_var": 0.04237035115559896, "learning_rate": 0.0001, "loss": 4.3725, "loss/crossentropy": 2.2190250158309937, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24385111033916473, "step": 9868 }, { "epoch": 0.1974, "grad_norm": 1.9453125, "grad_norm_var": 0.04182510375976563, "learning_rate": 0.0001, "loss": 4.2192, "loss/crossentropy": 2.0958147644996643, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23264098167419434, "step": 9870 }, { "epoch": 0.19744, "grad_norm": 2.25, "grad_norm_var": 0.04228897094726562, "learning_rate": 0.0001, "loss": 4.3571, "loss/crossentropy": 2.542472720146179, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23458892852067947, "step": 9872 }, { "epoch": 0.19748, "grad_norm": 2.21875, "grad_norm_var": 0.03728408813476562, "learning_rate": 0.0001, "loss": 4.2767, "loss/crossentropy": 2.1562893390655518, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2243279591202736, "step": 9874 }, { "epoch": 0.19752, "grad_norm": 2.21875, "grad_norm_var": 0.034395090738932294, "learning_rate": 0.0001, "loss": 4.0114, "loss/crossentropy": 1.9851951599121094, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2369420975446701, "step": 9876 }, { "epoch": 0.19756, "grad_norm": 2.03125, "grad_norm_var": 0.032714589436848955, "learning_rate": 0.0001, "loss": 4.3017, "loss/crossentropy": 1.8751549124717712, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20777788758277893, "step": 9878 }, { "epoch": 0.1976, "grad_norm": 2.125, "grad_norm_var": 0.0318267822265625, "learning_rate": 0.0001, "loss": 4.1328, "loss/crossentropy": 2.010055720806122, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21204733848571777, "step": 9880 }, { "epoch": 0.19764, "grad_norm": 2.0625, "grad_norm_var": 0.008942667643229167, "learning_rate": 0.0001, "loss": 4.1122, "loss/crossentropy": 2.04589307308197, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21330490708351135, "step": 9882 }, { "epoch": 0.19768, "grad_norm": 2.125, "grad_norm_var": 0.010138956705729167, "learning_rate": 0.0001, "loss": 4.1663, "loss/crossentropy": 1.9441133737564087, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22589778900146484, "step": 9884 }, { "epoch": 0.19772, "grad_norm": 2.25, "grad_norm_var": 0.008296457926432292, "learning_rate": 0.0001, "loss": 4.4239, "loss/crossentropy": 2.305395483970642, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24267761409282684, "step": 9886 }, { "epoch": 0.19776, "grad_norm": 2.03125, "grad_norm_var": 0.007269032796223958, "learning_rate": 0.0001, "loss": 4.386, "loss/crossentropy": 2.096014082431793, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21358592063188553, "step": 9888 }, { "epoch": 0.1978, "grad_norm": 2.25, "grad_norm_var": 0.008149973551432292, "learning_rate": 0.0001, "loss": 4.2315, "loss/crossentropy": 2.1115033626556396, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22994756698608398, "step": 9890 }, { "epoch": 0.19784, "grad_norm": 1.9375, "grad_norm_var": 0.008654530843098958, "learning_rate": 0.0001, "loss": 4.1419, "loss/crossentropy": 1.6122692227363586, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1897209882736206, "step": 9892 }, { "epoch": 0.19788, "grad_norm": 2.0625, "grad_norm_var": 0.012933095296223959, "learning_rate": 0.0001, "loss": 4.1679, "loss/crossentropy": 1.4790136218070984, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17421242594718933, "step": 9894 }, { "epoch": 0.19792, "grad_norm": 2.09375, "grad_norm_var": 0.013700358072916667, "learning_rate": 0.0001, "loss": 4.2921, "loss/crossentropy": 1.8394885063171387, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20840360969305038, "step": 9896 }, { "epoch": 0.19796, "grad_norm": 2.03125, "grad_norm_var": 0.0135894775390625, "learning_rate": 0.0001, "loss": 4.3993, "loss/crossentropy": 2.152444541454315, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23834071308374405, "step": 9898 }, { "epoch": 0.198, "grad_norm": 2.1875, "grad_norm_var": 0.012189737955729167, "learning_rate": 0.0001, "loss": 4.5268, "loss/crossentropy": 2.367082357406616, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23409543931484222, "step": 9900 }, { "epoch": 0.19804, "grad_norm": 2.125, "grad_norm_var": 0.011799112955729166, "learning_rate": 0.0001, "loss": 4.4599, "loss/crossentropy": 2.136048436164856, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21797242760658264, "step": 9902 }, { "epoch": 0.19808, "grad_norm": 2.03125, "grad_norm_var": 0.011945597330729167, "learning_rate": 0.0001, "loss": 4.2262, "loss/crossentropy": 2.035883128643036, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.215388223528862, "step": 9904 }, { "epoch": 0.19812, "grad_norm": 2.203125, "grad_norm_var": 0.011970774332682291, "learning_rate": 0.0001, "loss": 4.2024, "loss/crossentropy": 2.0339369773864746, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2085876688361168, "step": 9906 }, { "epoch": 0.19816, "grad_norm": 2.0625, "grad_norm_var": 0.009655507405598958, "learning_rate": 0.0001, "loss": 4.4441, "loss/crossentropy": 1.829429566860199, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20085029304027557, "step": 9908 }, { "epoch": 0.1982, "grad_norm": 1.984375, "grad_norm_var": 0.008861287434895834, "learning_rate": 0.0001, "loss": 4.1108, "loss/crossentropy": 2.1721774339675903, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2118668630719185, "step": 9910 }, { "epoch": 0.19824, "grad_norm": 2.265625, "grad_norm_var": 0.008861287434895834, "learning_rate": 0.0001, "loss": 4.3511, "loss/crossentropy": 1.958261251449585, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22304313629865646, "step": 9912 }, { "epoch": 0.19828, "grad_norm": 2.21875, "grad_norm_var": 0.009870402018229167, "learning_rate": 0.0001, "loss": 4.5819, "loss/crossentropy": 2.4651763439178467, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2435118407011032, "step": 9914 }, { "epoch": 0.19832, "grad_norm": 1.9140625, "grad_norm_var": 0.012532297770182292, "learning_rate": 0.0001, "loss": 4.0973, "loss/crossentropy": 2.173910617828369, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21142201125621796, "step": 9916 }, { "epoch": 0.19836, "grad_norm": 2.078125, "grad_norm_var": 0.013239542643229166, "learning_rate": 0.0001, "loss": 3.9283, "loss/crossentropy": 1.9559763073921204, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19051063805818558, "step": 9918 }, { "epoch": 0.1984, "grad_norm": 2.265625, "grad_norm_var": 0.015895334879557292, "learning_rate": 0.0001, "loss": 4.2516, "loss/crossentropy": 2.238003969192505, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2338731735944748, "step": 9920 }, { "epoch": 0.19844, "grad_norm": 2.140625, "grad_norm_var": 0.015233357747395834, "learning_rate": 0.0001, "loss": 4.4624, "loss/crossentropy": 2.269726276397705, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19430368393659592, "step": 9922 }, { "epoch": 0.19848, "grad_norm": 2.109375, "grad_norm_var": 0.015697224934895834, "learning_rate": 0.0001, "loss": 4.436, "loss/crossentropy": 2.351833701133728, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24469508230686188, "step": 9924 }, { "epoch": 0.19852, "grad_norm": 2.15625, "grad_norm_var": 0.013398996988932292, "learning_rate": 0.0001, "loss": 4.3833, "loss/crossentropy": 2.0600146055221558, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21375566720962524, "step": 9926 }, { "epoch": 0.19856, "grad_norm": 2.015625, "grad_norm_var": 0.012562815348307292, "learning_rate": 0.0001, "loss": 4.2957, "loss/crossentropy": 2.1543694734573364, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20916736871004105, "step": 9928 }, { "epoch": 0.1986, "grad_norm": 2.015625, "grad_norm_var": 0.010782623291015625, "learning_rate": 0.0001, "loss": 4.3111, "loss/crossentropy": 1.8952317833900452, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2216978445649147, "step": 9930 }, { "epoch": 0.19864, "grad_norm": 2.03125, "grad_norm_var": 0.008695475260416667, "learning_rate": 0.0001, "loss": 4.0542, "loss/crossentropy": 2.102243661880493, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22314947098493576, "step": 9932 }, { "epoch": 0.19868, "grad_norm": 1.9921875, "grad_norm_var": 0.0072021484375, "learning_rate": 0.0001, "loss": 4.1653, "loss/crossentropy": 1.9036884307861328, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19729873538017273, "step": 9934 }, { "epoch": 0.19872, "grad_norm": 2.0, "grad_norm_var": 0.006459299723307292, "learning_rate": 0.0001, "loss": 4.17, "loss/crossentropy": 1.823796033859253, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18089265376329422, "step": 9936 }, { "epoch": 0.19876, "grad_norm": 2.0, "grad_norm_var": 0.006302642822265625, "learning_rate": 0.0001, "loss": 3.8742, "loss/crossentropy": 2.055707633495331, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20620816200971603, "step": 9938 }, { "epoch": 0.1988, "grad_norm": 2.109375, "grad_norm_var": 0.0055010477701822914, "learning_rate": 0.0001, "loss": 4.256, "loss/crossentropy": 2.0490049719810486, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21709006279706955, "step": 9940 }, { "epoch": 0.19884, "grad_norm": 2.0625, "grad_norm_var": 0.005891672770182292, "learning_rate": 0.0001, "loss": 4.2733, "loss/crossentropy": 2.164198637008667, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2335330694913864, "step": 9942 }, { "epoch": 0.19888, "grad_norm": 1.9609375, "grad_norm_var": 0.006941731770833333, "learning_rate": 0.0001, "loss": 4.1463, "loss/crossentropy": 1.9218478202819824, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1945241093635559, "step": 9944 }, { "epoch": 0.19892, "grad_norm": 2.109375, "grad_norm_var": 0.007328033447265625, "learning_rate": 0.0001, "loss": 3.8591, "loss/crossentropy": 1.9456552267074585, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20599794387817383, "step": 9946 }, { "epoch": 0.19896, "grad_norm": 1.96875, "grad_norm_var": 0.007749176025390625, "learning_rate": 0.0001, "loss": 4.0385, "loss/crossentropy": 2.0472273230552673, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20920252054929733, "step": 9948 }, { "epoch": 0.199, "grad_norm": 1.96875, "grad_norm_var": 0.007860310872395833, "learning_rate": 0.0001, "loss": 4.308, "loss/crossentropy": 2.2252047061920166, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2360471710562706, "step": 9950 }, { "epoch": 0.19904, "grad_norm": 2.09375, "grad_norm_var": 0.005537923177083333, "learning_rate": 0.0001, "loss": 4.0828, "loss/crossentropy": 1.646431565284729, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1692601889371872, "step": 9952 }, { "epoch": 0.19908, "grad_norm": 2.046875, "grad_norm_var": 0.0054443359375, "learning_rate": 0.0001, "loss": 3.8818, "loss/crossentropy": 2.0114784836769104, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20070793479681015, "step": 9954 }, { "epoch": 0.19912, "grad_norm": 1.953125, "grad_norm_var": 0.00677490234375, "learning_rate": 0.0001, "loss": 4.324, "loss/crossentropy": 2.001778781414032, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21009314060211182, "step": 9956 }, { "epoch": 0.19916, "grad_norm": 2.0, "grad_norm_var": 0.006037394205729167, "learning_rate": 0.0001, "loss": 4.3493, "loss/crossentropy": 2.1330565214157104, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22442802786827087, "step": 9958 }, { "epoch": 0.1992, "grad_norm": 1.9921875, "grad_norm_var": 0.005635579427083333, "learning_rate": 0.0001, "loss": 4.193, "loss/crossentropy": 1.9146793484687805, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21145610511302948, "step": 9960 }, { "epoch": 0.19924, "grad_norm": 2.015625, "grad_norm_var": 0.004705556233723958, "learning_rate": 0.0001, "loss": 4.3262, "loss/crossentropy": 2.5224483013153076, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23354032635688782, "step": 9962 }, { "epoch": 0.19928, "grad_norm": 2.109375, "grad_norm_var": 0.004552968343098958, "learning_rate": 0.0001, "loss": 4.3663, "loss/crossentropy": 2.245160937309265, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21848157793283463, "step": 9964 }, { "epoch": 0.19932, "grad_norm": 2.15625, "grad_norm_var": 0.004622141520182292, "learning_rate": 0.0001, "loss": 4.3506, "loss/crossentropy": 2.122299015522003, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21853189170360565, "step": 9966 }, { "epoch": 0.19936, "grad_norm": 2.171875, "grad_norm_var": 0.005716705322265625, "learning_rate": 0.0001, "loss": 4.4524, "loss/crossentropy": 2.31084668636322, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23691194504499435, "step": 9968 }, { "epoch": 0.1994, "grad_norm": 2.25, "grad_norm_var": 0.007834625244140626, "learning_rate": 0.0001, "loss": 4.274, "loss/crossentropy": 2.2242307662963867, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21957046538591385, "step": 9970 }, { "epoch": 0.19944, "grad_norm": 2.078125, "grad_norm_var": 0.007063547770182292, "learning_rate": 0.0001, "loss": 4.2264, "loss/crossentropy": 1.792852759361267, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19996580481529236, "step": 9972 }, { "epoch": 0.19948, "grad_norm": 2.09375, "grad_norm_var": 0.0063250223795572914, "learning_rate": 0.0001, "loss": 4.3029, "loss/crossentropy": 1.9593411087989807, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21858739852905273, "step": 9974 }, { "epoch": 0.19952, "grad_norm": 1.9609375, "grad_norm_var": 0.0067827860514322914, "learning_rate": 0.0001, "loss": 4.3067, "loss/crossentropy": 2.259618401527405, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22094043344259262, "step": 9976 }, { "epoch": 0.19956, "grad_norm": 2.140625, "grad_norm_var": 0.0063168843587239586, "learning_rate": 0.0001, "loss": 4.373, "loss/crossentropy": 2.31876802444458, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22025877982378006, "step": 9978 }, { "epoch": 0.1996, "grad_norm": 2.0625, "grad_norm_var": 0.006109364827473958, "learning_rate": 0.0001, "loss": 4.15, "loss/crossentropy": 1.7625555396080017, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21313250809907913, "step": 9980 }, { "epoch": 0.19964, "grad_norm": 2.265625, "grad_norm_var": 0.007342274983723958, "learning_rate": 0.0001, "loss": 4.4628, "loss/crossentropy": 2.19295072555542, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23694515973329544, "step": 9982 }, { "epoch": 0.19968, "grad_norm": 2.0625, "grad_norm_var": 0.008739217122395834, "learning_rate": 0.0001, "loss": 4.187, "loss/crossentropy": 2.4073877334594727, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22179137915372849, "step": 9984 }, { "epoch": 0.19972, "grad_norm": 2.015625, "grad_norm_var": 0.008125813802083333, "learning_rate": 0.0001, "loss": 3.9804, "loss/crossentropy": 1.8942558765411377, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20588286221027374, "step": 9986 }, { "epoch": 0.19976, "grad_norm": 1.9375, "grad_norm_var": 0.012188466389973958, "learning_rate": 0.0001, "loss": 4.109, "loss/crossentropy": 2.220746397972107, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2144280970096588, "step": 9988 }, { "epoch": 0.1998, "grad_norm": 2.125, "grad_norm_var": 0.01260986328125, "learning_rate": 0.0001, "loss": 4.2279, "loss/crossentropy": 1.9511706233024597, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1998429372906685, "step": 9990 }, { "epoch": 0.19984, "grad_norm": 2.125, "grad_norm_var": 0.012247467041015625, "learning_rate": 0.0001, "loss": 4.143, "loss/crossentropy": 2.249726891517639, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24011528491973877, "step": 9992 }, { "epoch": 0.19988, "grad_norm": 2.6875, "grad_norm_var": 0.03792292277018229, "learning_rate": 0.0001, "loss": 4.3104, "loss/crossentropy": 1.957375943660736, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24464774131774902, "step": 9994 }, { "epoch": 0.19992, "grad_norm": 2.21875, "grad_norm_var": 0.039033762613932294, "learning_rate": 0.0001, "loss": 4.5492, "loss/crossentropy": 2.265984058380127, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24291887879371643, "step": 9996 }, { "epoch": 0.19996, "grad_norm": 2.046875, "grad_norm_var": 0.03920873006184896, "learning_rate": 0.0001, "loss": 4.1432, "loss/crossentropy": 1.8064668774604797, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21571090072393417, "step": 9998 }, { "epoch": 0.2, "grad_norm": 2.234375, "grad_norm_var": 0.03875732421875, "learning_rate": 0.0001, "loss": 4.2295, "loss/crossentropy": 1.9072380661964417, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19980185478925705, "step": 10000 } ], "logging_steps": 2, "max_steps": 50000, "num_input_tokens_seen": 0, "num_train_epochs": 9223372036854775807, "save_steps": 10000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.04147655327744e+19, "train_batch_size": 8, "trial_name": null, "trial_params": null }